encoding_types/
lib.rs

1// This is a part of encoding-next.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5/*!
6 * Interface to the character encoding.
7 *
8 * # Raw incremental interface
9 *
 * Methods whose names start with `raw_` constitute the raw incremental interface,
 * the lowest-available API for encoders and decoders.
 * This interface divides the entire input into four parts:
13 *
14 * - **Processed** bytes do not affect the future result.
15 * - **Unprocessed** bytes may affect the future result
16 *   and can be a part of problematic sequence according to the future input.
17 * - **Problematic** byte is the first byte that causes an error condition.
18 * - **Remaining** bytes are not yet processed nor read,
19 *   so the caller should feed any remaining bytes again.
20 *
21 * The following figure illustrates an example of successive `raw_feed` calls:
22 *
23 * ````notrust
24 * 1st raw_feed   :2nd raw_feed   :3rd raw_feed
25 * ----------+----:---------------:--+--+---------
26 *           |    :               :  |  |
27 * ----------+----:---------------:--+--+---------
28 * processed  unprocessed             |  remaining
29 *                               problematic
30 * ````
31 *
 * Since these parts can span multiple input sequences given to `raw_feed`,
 * `raw_feed` returns two offsets (one optional)
 * with which the caller can track the problematic sequence.
 * The first offset (the first `usize` in the tuple) points to the first unprocessed byte,
 * or is zero when unprocessed bytes have started before the current call.
 * (The first unprocessed byte can also be at offset 0,
 * which doesn't make a difference for the caller.)
 * The second offset (`upto` field in the `CodecError` struct), if any,
 * points to the first remaining byte.
41 *
42 * If the caller needs to recover the error via the problematic sequence,
43 * then the caller starts to save the unprocessed bytes when the first offset < the input length,
44 * appends any new unprocessed bytes while the first offset is zero,
45 * and discards unprocessed bytes when first offset becomes non-zero
46 * while saving new unprocessed bytes when the first offset < the input length.
47 * Then the caller checks for the error condition
48 * and can use the saved unprocessed bytes for error recovery.
49 * Alternatively, if the caller only wants to replace the problematic sequence
50 * with a fixed string (like U+FFFD),
51 * then it can just discard the first sequence and can emit the fixed string on an error.
52 * It still has to feed the input bytes starting at the second offset again.
53 */
54
55use std::borrow::Cow;
56use std::fmt;
57
/// Error information from either encoder or decoder.
///
/// The standard `Debug`, `Clone`, `PartialEq` and `Eq` traits are derived so that
/// errors can be logged, compared in tests and carried through `Result::unwrap`/`expect`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodecError {
    /// The byte position of the first remaining byte, relative to the *current* input.
    ///
    /// For the `finish` call this should be no more than zero (since there is no input).
    /// It can be negative if the remaining byte lies in one of the prior inputs,
    /// as long as that byte has not been processed yet.
    /// The caller should feed the bytes starting from this point again
    /// in order to continue encoding or decoding after an error.
    pub upto: isize,
    /// A human-readable cause of the error.
    pub cause: Cow<'static, str>,
}
70
/// Sink for the bytes produced by encoders.
/// Most of the time the implementor is simply an owned `Vec<u8>`.
pub trait ByteWriter {
    /// Announces an expected lower bound on the number of bytes
    /// that will be written before the next call to `writer_hint`,
    /// giving the writer a chance to reserve memory up front.
    ///
    /// `RawEncoder`s are encouraged, but not obliged, to call this
    /// with a sensible estimate.
    /// The provided implementation ignores the hint.
    fn writer_hint(&mut self, _expectedlen: usize) {}

    /// Appends one byte to the output.
    fn write_byte(&mut self, b: u8);

    /// Appends a whole slice of bytes to the output.
    fn write_bytes(&mut self, v: &[u8]);

    /// Gives access to the underlying `Vec<u8>` when this writer *is* a `Vec<u8>`.
    ///
    /// Returns `Some(&mut Vec<u8>)` in that case and `None` for any other writer;
    /// the provided implementation always answers `None`.
    fn as_mut_vec(&mut self) -> Option<&mut Vec<u8>> {
        None
    }
}
93
94impl ByteWriter for Vec<u8> {
95    fn writer_hint(&mut self, expectedlen: usize) {
96        self.reserve(expectedlen);
97    }
98
99    fn write_byte(&mut self, b: u8) {
100        self.push(b);
101    }
102
103    fn write_bytes(&mut self, v: &[u8]) {
104        self.extend_from_slice(v);
105    }
106
107    fn as_mut_vec(&mut self) -> Option<&mut Vec<u8>> {
108        Some(self)
109    }
110}
111
/// Sink for the text produced by decoders.
/// Most of the time the implementor is simply an owned `String`.
pub trait StringWriter {
    /// Announces an expected lower bound on the length (in bytes) of the output
    /// that will be written before the next call to `writer_hint`,
    /// giving the writer a chance to reserve memory up front.
    ///
    /// `RawDecoder`s are encouraged, but not obliged, to call this
    /// with a sensible estimate.
    /// The provided implementation ignores the hint.
    fn writer_hint(&mut self, _expectedlen: usize) {}

    /// Appends one character to the output.
    fn write_char(&mut self, c: char);

    /// Appends a string slice to the output.
    fn write_str(&mut self, s: &str);

    /// Gives access to the underlying `String` when this writer *is* a `String`.
    ///
    /// Returns `Some(&mut String)` in that case and `None` for any other writer;
    /// the provided implementation always answers `None`.
    fn as_mut_string(&mut self) -> Option<&mut String> {
        None
    }
}
134
135impl StringWriter for String {
136    fn writer_hint(&mut self, expectedlen: usize) {
137        self.reserve(expectedlen);
138    }
139
140    fn write_char(&mut self, c: char) {
141        self.push(c);
142    }
143
144    fn write_str(&mut self, s: &str) {
145        self.push_str(s);
146    }
147
148    fn as_mut_string(&mut self) -> Option<&mut String> {
149        Some(self)
150    }
151}
152
153/// Encoder converting a Unicode string into a byte sequence.
154/// This is a lower level interface, and normally `Encoding::encode` should be used instead.
155pub trait RawEncoder: Send + 'static {
156    /// Creates a fresh `RawEncoder` instance which parameters are same as `self`.
157    #[allow(clippy::wrong_self_convention)]
158    fn from_self(&self) -> Box<dyn RawEncoder>;
159
160    /// Returns true if this encoding is compatible to ASCII,
161    /// i.e. U+0000 through U+007F always map to bytes 00 through 7F and nothing else.
162    fn is_ascii_compatible(&self) -> bool {
163        false
164    }
165
166    /// Feeds given portion of string to the encoder,
167    /// pushes the an encoded byte sequence at the end of the given output,
168    /// and returns a byte offset to the first unprocessed character
169    /// (that can be zero when the first such character appeared in the prior calls to `raw_feed`)
170    /// and optional error information (None means success).
171    fn raw_feed(&mut self, input: &str, output: &mut dyn ByteWriter)
172        -> (usize, Option<CodecError>);
173
174    /// Finishes the encoder,
175    /// pushes the an encoded byte sequence at the end of the given output,
176    /// and returns optional error information (None means success).
177    /// `remaining` value of the error information, if any, is always an empty string.
178    fn raw_finish(&mut self, output: &mut dyn ByteWriter) -> Option<CodecError>;
179}
180
181/// Decoder converting a byte sequence into a Unicode string.
182/// This is a lower level interface, and normally `Encoding::decode` should be used instead.
183pub trait RawDecoder: Send + 'static {
184    /// Creates a fresh `RawDecoder` instance which parameters are same as `self`.
185    #[allow(clippy::wrong_self_convention)]
186    fn from_self(&self) -> Box<dyn RawDecoder>;
187
188    /// Returns true if this encoding is compatible to ASCII,
189    /// i.e. bytes 00 through 7F always map to U+0000 through U+007F and nothing else.
190    fn is_ascii_compatible(&self) -> bool {
191        false
192    }
193
194    /// Feeds given portion of byte sequence to the encoder,
195    /// pushes the a decoded string at the end of the given output,
196    /// and returns an offset to the first unprocessed byte
197    /// (that can be zero when the first such byte appeared in the prior calls to `raw_feed`)
198    /// and optional error information (None means success).
199    fn raw_feed(
200        &mut self,
201        input: &[u8],
202        output: &mut dyn StringWriter,
203    ) -> (usize, Option<CodecError>);
204
205    /// Finishes the decoder,
206    /// pushes the a decoded string at the end of the given output,
207    /// and returns optional error information (None means success).
208    fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError>;
209}
210
211/// A trait object using dynamic dispatch which is a sendable reference to the encoding,
212/// for code where the encoding is not known at compile-time.
213pub type EncodingRef = &'static (dyn Encoding + Send + Sync);
214
215/// Character encoding.
216pub trait Encoding {
217    /// Returns the canonical name of given encoding.
218    /// This name is guaranteed to be unique across built-in encodings,
219    /// but it is not normative and would be at most arbitrary.
220    fn name(&self) -> &'static str;
221
222    /// Returns a name of given encoding defined in the WHATWG Encoding standard, if any.
223    /// This name often differs from `name` due to the compatibility reason.
224    fn whatwg_name(&self) -> Option<&'static str> {
225        None
226    }
227
228    /// Creates a new encoder.
229    fn raw_encoder(&self) -> Box<dyn RawEncoder>;
230
231    /// Creates a new decoder.
232    fn raw_decoder(&self) -> Box<dyn RawDecoder>;
233
234    /// An easy-to-use interface to `RawEncoder`.
235    /// On the encoder error `trap` is called,
236    /// which may return a replacement sequence to continue processing,
237    /// or a failure to return the error.
238    fn encode(&self, input: &str, trap: EncoderTrap) -> Result<Vec<u8>, Cow<'static, str>> {
239        let mut ret = Vec::new();
240        self.encode_to(input, trap, &mut ret).map(|_| ret)
241    }
242
243    /// Encode into a `dyn ByteWriter`.
244    fn encode_to(
245        &self,
246        input: &str,
247        trap: EncoderTrap,
248        ret: &mut dyn ByteWriter,
249    ) -> Result<(), Cow<'static, str>> {
250        // we don't need to keep `unprocessed` here;
251        // `raw_feed` should process as much input as possible.
252        let mut encoder = self.raw_encoder();
253        let mut remaining = 0;
254
255        loop {
256            let (offset, err) = encoder.raw_feed(&input[remaining..], ret);
257            let unprocessed = remaining + offset;
258            match err {
259                Some(err) => {
260                    remaining = (remaining as isize + err.upto) as usize;
261                    if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
262                        return Err(err.cause);
263                    }
264                }
265                None => {
266                    remaining = input.len();
267                    match encoder.raw_finish(ret) {
268                        Some(err) => {
269                            remaining = (remaining as isize + err.upto) as usize;
270                            if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
271                                return Err(err.cause);
272                            }
273                        }
274                        None => {}
275                    }
276                    if remaining >= input.len() {
277                        return Ok(());
278                    }
279                }
280            }
281        }
282    }
283
284    /// An easy-to-use interface to `RawDecoder`.
285    /// On the decoder error `trap` is called,
286    /// which may return a replacement string to continue processing,
287    /// or a failure to return the error.
288    fn decode(&self, input: &[u8], trap: DecoderTrap) -> Result<String, Cow<'static, str>> {
289        let mut ret = String::new();
290        self.decode_to(input, trap, &mut ret).map(|_| ret)
291    }
292
293    /// Decode into a `StringWriter`.
294    ///
295    /// This does *not* handle partial characters at the beginning or end of `input`!
296    /// Use `RawDecoder` for incremental decoding.
297    fn decode_to(
298        &self,
299        input: &[u8],
300        trap: DecoderTrap,
301        ret: &mut dyn StringWriter,
302    ) -> Result<(), Cow<'static, str>> {
303        // we don't need to keep `unprocessed` here;
304        // `raw_feed` should process as much input as possible.
305        let mut decoder = self.raw_decoder();
306        let mut remaining = 0;
307
308        loop {
309            let (offset, err) = decoder.raw_feed(&input[remaining..], ret);
310            let unprocessed = remaining + offset;
311            match err {
312                Some(err) => {
313                    remaining = (remaining as isize + err.upto) as usize;
314                    if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
315                        return Err(err.cause);
316                    }
317                }
318                None => {
319                    remaining = input.len();
320                    match decoder.raw_finish(ret) {
321                        Some(err) => {
322                            remaining = (remaining as isize + err.upto) as usize;
323                            if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
324                                return Err(err.cause);
325                            }
326                        }
327                        None => {}
328                    }
329                    if remaining >= input.len() {
330                        return Ok(());
331                    }
332                }
333            }
334        }
335    }
336}
337
338impl<'a> fmt::Debug for &'a dyn Encoding {
339    fn fmt(&self, fmt: &mut fmt::Formatter) -> Result<(), fmt::Error> {
340        fmt.write_str("Encoding(")?;
341        fmt.write_str(self.name())?;
342        fmt.write_str(")")?;
343        Ok(())
344    }
345}
346
347/// A type of the bare function in `EncoderTrap` values.
348pub type EncoderTrapFunc =
349    fn(encoder: &mut dyn RawEncoder, input: &str, output: &mut dyn ByteWriter) -> bool;
350
351/// A type of the bare function in `DecoderTrap` values.
352pub type DecoderTrapFunc =
353    fn(decoder: &mut dyn RawDecoder, input: &[u8], output: &mut dyn StringWriter) -> bool;
354
355/// Trap, which handles decoder errors.
356#[derive(Copy)]
357pub enum DecoderTrap {
358    /// Immediately fails on errors.
359    /// Corresponds to WHATWG "fatal" error algorithm.
360    Strict,
361    /// Replaces an error with a U+FFFD (decoder).
362    /// Corresponds to WHATWG "replacement" error algorithm.
363    Replace,
364    /// Silently ignores an error, effectively replacing it with an empty sequence.
365    Ignore,
366    /// Calls given function to handle decoder errors.
367    /// The function is given the current decoder, input and output writer,
368    /// and should return true only when it is fine to keep going.
369    Call(DecoderTrapFunc),
370}
371
372impl DecoderTrap {
373    /// Handles a decoder error. May write to the output writer.
374    /// Returns true only when it is fine to keep going.
375    pub fn trap(
376        &self,
377        decoder: &mut dyn RawDecoder,
378        input: &[u8],
379        output: &mut dyn StringWriter,
380    ) -> bool {
381        match *self {
382            DecoderTrap::Strict => false,
383            DecoderTrap::Replace => {
384                output.write_char('\u{fffd}');
385                true
386            }
387            DecoderTrap::Ignore => true,
388            DecoderTrap::Call(func) => func(decoder, input, output),
389        }
390    }
391}
392
393impl Clone for DecoderTrap {
394    fn clone(&self) -> DecoderTrap {
395        match *self {
396            DecoderTrap::Strict => DecoderTrap::Strict,
397            DecoderTrap::Replace => DecoderTrap::Replace,
398            DecoderTrap::Ignore => DecoderTrap::Ignore,
399            DecoderTrap::Call(f) => DecoderTrap::Call(f),
400        }
401    }
402}
403
404#[derive(Copy)]
405pub enum EncoderTrap {
406    /// Immediately fails on errors.
407    /// Corresponds to WHATWG "fatal" error algorithm.
408    Strict,
409    /// Replaces an error with `?` in given encoding.
410    /// Note that this fails when `?` cannot be represented in given encoding.
411    /// Corresponds to WHATWG "URL" error algorithms.
412    Replace,
413    /// Silently ignores an error, effectively replacing it with an empty sequence.
414    Ignore,
415    /// Replaces an error with XML numeric character references (e.g. `&#1234;`).
416    /// The encoder trap fails when NCRs cannot be represented in given encoding.
417    /// Corresponds to WHATWG "<form>" error algorithms.
418    NcrEscape,
419    /// Calls given function to handle encoder errors.
420    /// The function is given the current encoder, input and output writer,
421    /// and should return true only when it is fine to keep going.
422    Call(EncoderTrapFunc),
423}
424
425impl EncoderTrap {
426    /// Handles an encoder error. May write to the output writer.
427    /// Returns true only when it is fine to keep going.
428    pub fn trap(
429        &self,
430        encoder: &mut dyn RawEncoder,
431        input: &str,
432        output: &mut dyn ByteWriter,
433    ) -> bool {
434        fn reencode(
435            encoder: &mut dyn RawEncoder,
436            input: &str,
437            output: &mut dyn ByteWriter,
438            trapname: &str,
439        ) -> bool {
440            if encoder.is_ascii_compatible() {
441                // optimization!
442                output.write_bytes(input.as_bytes());
443            } else {
444                let (_, err) = encoder.raw_feed(input, output);
445                if err.is_some() {
446                    panic!("{} cannot reencode a replacement string", trapname);
447                }
448            }
449            true
450        }
451
452        match *self {
453            EncoderTrap::Strict => false,
454            EncoderTrap::Replace => reencode(encoder, "?", output, "Replace"),
455            EncoderTrap::Ignore => true,
456            EncoderTrap::NcrEscape => {
457                let mut escapes = String::new();
458                for ch in input.chars() {
459                    escapes.push_str(&format!("&#{};", ch as isize));
460                }
461                reencode(encoder, &escapes, output, "NcrEscape")
462            }
463            EncoderTrap::Call(func) => func(encoder, input, output),
464        }
465    }
466}
467
468impl Clone for EncoderTrap {
469    fn clone(&self) -> EncoderTrap {
470        match *self {
471            EncoderTrap::Strict => EncoderTrap::Strict,
472            EncoderTrap::Replace => EncoderTrap::Replace,
473            EncoderTrap::Ignore => EncoderTrap::Ignore,
474            EncoderTrap::NcrEscape => EncoderTrap::NcrEscape,
475            EncoderTrap::Call(f) => EncoderTrap::Call(f),
476        }
477    }
478}