encoding/lib.rs
1// This is a part of encoding-next.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! # Encoding
6//!
7//! Character encoding support for Rust.
8//! It is based on [WHATWG Encoding Standard](http://encoding.spec.whatwg.org/),
9//! and also provides an advanced interface for error detection and recovery.
10//!
11//! ## Usage
12//!
13//! Put this in your `Cargo.toml`:
14//!
15//! ```toml
16//! [dependencies]
17//! encoding = "0.3"
18//! ```
19//!
20//! ### Data Table
21//!
22//! By default, Encoding comes with ~480 KB of data table ("indices").
23//! This allows Encoding to encode and decode legacy encodings efficiently,
24//! but this might not be desirable for some applications.
25//!
26//! Encoding provides the `no-optimized-legacy-encoding` Cargo feature
27//! to reduce the size of encoding tables (to ~185 KB)
28//! at the expense of encoding performance (typically 5x to 20x slower).
29//! The decoding performance remains identical.
30//! **This feature is strongly intended for end users.
31//! Do not enable this feature from library crates, ever.**
32//!
33//! For finer-tuned optimization, see `src/index/gen_index.py` for
34//! custom table generation. At the most reduced (and slowest) setting,
35//! the minimal size of data table is about 160 KB.
36//!
37//! ## Overview
38//!
39//! To encode a string:
40//!
41//! ```rust
42//! use encoding::{Encoding, EncoderTrap};
43//! use encoding::all::ISO_8859_1;
44//!
45//! assert_eq!(ISO_8859_1.encode("caf\u{e9}", EncoderTrap::Strict),
46//! Ok(vec![99,97,102,233]));
47//! ```
48//!
49//! To encode a string with unrepresentable characters:
50//!
51//! ```rust
52//! use encoding::{Encoding, EncoderTrap};
53//! use encoding::all::ISO_8859_2;
54//!
55//! assert!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Strict).is_err());
56//! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Replace),
57//! Ok(vec![65,99,109,101,63]));
58//! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Ignore),
59//! Ok(vec![65,99,109,101]));
60//! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::NcrEscape),
61//! Ok(vec![65,99,109,101,38,35,49,54,57,59]));
62//! ```
63//!
64//! To decode a byte sequence:
65//!
66//! ```rust
67//! use encoding::{Encoding, DecoderTrap};
68//! use encoding::all::ISO_8859_1;
69//!
70//! assert_eq!(ISO_8859_1.decode(&[99,97,102,233], DecoderTrap::Strict),
71//! Ok("caf\u{e9}".to_string()));
72//! ```
73//!
74//! To decode a byte sequence with invalid sequences:
75//!
76//! ```rust
77//! use encoding::{Encoding, DecoderTrap};
78//! use encoding::all::ISO_8859_6;
79//!
80//! assert!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Strict).is_err());
81//! assert_eq!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Replace),
82//! Ok("Acme\u{fffd}".to_string()));
83//! assert_eq!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Ignore),
84//! Ok("Acme".to_string()));
85//! ```
86//!
87//! To encode or decode the input into the already allocated buffer:
88//!
89//! ```rust
90//! use encoding::{Encoding, EncoderTrap, DecoderTrap};
91//! use encoding::all::{ISO_8859_2, ISO_8859_6};
92//!
93//! let mut bytes = Vec::new();
94//! let mut chars = String::new();
95//!
96//! assert!(ISO_8859_2.encode_to("Acme\u{a9}", EncoderTrap::Ignore, &mut bytes).is_ok());
97//! assert!(ISO_8859_6.decode_to(&[65,99,109,101,169], DecoderTrap::Replace, &mut chars).is_ok());
98//!
99//! assert_eq!(bytes, [65,99,109,101]);
100//! assert_eq!(chars, "Acme\u{fffd}");
101//! ```
102//!
103//! A practical example of custom encoder traps:
104//!
105//! ```rust
106//! use encoding::{Encoding, ByteWriter, EncoderTrap, DecoderTrap};
107//! use encoding::types::RawEncoder;
108//! use encoding::all::ASCII;
109//!
110//! // hexadecimal numeric character reference replacement
111//! fn hex_ncr_escape(_encoder: &mut dyn RawEncoder, input: &str, output: &mut dyn ByteWriter) -> bool {
112//! let escapes: Vec<String> =
113//! input.chars().map(|ch| format!("&#x{:x};", ch as isize)).collect();
114//! let escapes = escapes.concat();
115//! output.write_bytes(escapes.as_bytes());
116//! true
117//! }
118//! static HEX_NCR_ESCAPE: EncoderTrap = EncoderTrap::Call(hex_ncr_escape);
119//!
120//! let orig = "Hello, 世界!".to_string();
121//! let encoded = ASCII.encode(&orig, HEX_NCR_ESCAPE).unwrap();
122//! assert_eq!(ASCII.decode(&encoded, DecoderTrap::Strict),
//!            Ok("Hello, &#x4e16;&#x754c;!".to_string()));
124//! ```
125//!
126//! Getting the encoding from the string label, as specified in WHATWG Encoding standard:
127//!
128//! ```rust
129//! use encoding::{Encoding, DecoderTrap};
130//! use encoding::label::encoding_from_whatwg_label;
131//! use encoding::all::WINDOWS_949;
132//!
133//! let euckr = encoding_from_whatwg_label("euc-kr").unwrap();
134//! assert_eq!(euckr.name(), "windows-949");
135//! assert_eq!(euckr.whatwg_name(), Some("euc-kr")); // for the sake of compatibility
136//! let broken = &[0xbf, 0xec, 0xbf, 0xcd, 0xff, 0xbe, 0xd3];
137//! assert_eq!(euckr.decode(broken, DecoderTrap::Replace),
138//! Ok("\u{c6b0}\u{c640}\u{fffd}\u{c559}".to_string()));
139//!
140//! // corresponding Encoding native API:
141//! assert_eq!(WINDOWS_949.decode(broken, DecoderTrap::Replace),
142//! Ok("\u{c6b0}\u{c640}\u{fffd}\u{c559}".to_string()));
143//! ```
144//!
145//! ## Types and Stuffs
146//!
147//! There are three main entry points to Encoding.
148//!
149//! **`Encoding`** is a single character encoding.
150//! It contains `encode` and `decode` methods for converting `String` to `Vec<u8>` and vice versa.
151//! For the error handling, they receive **traps** (`EncoderTrap` and `DecoderTrap` respectively)
152//! which replace any error with some string (e.g. `U+FFFD`) or sequence (e.g. `?`).
153//! You can also use `EncoderTrap::Strict` and `DecoderTrap::Strict` traps to stop on an error.
154//!
155//! There are two ways to get `Encoding`:
156//!
157//! * `encoding::all` has static items for every supported encoding.
//! You should use them when the encoding would not change or only a handful of them are required.
159//! Combined with link-time optimization, any unused encoding would be discarded from the binary.
160//! * `encoding::label` has functions to dynamically get an encoding from given string ("label").
161//! They will return a static reference to the encoding,
//! whose type is also known as `EncodingRef`.
163//! It is useful when a list of required encodings is not available in advance,
164//! but it will result in the larger binary and missed optimization opportunities.
165//!
166//! **`RawEncoder`** is an experimental incremental encoder.
167//! At each step of `raw_feed`, it receives a slice of string
168//! and emits any encoded bytes to a generic `dyn ByteWriter` (normally `Vec<u8>`).
169//! It will stop at the first error if any, and would return a `CodecError` struct in that case.
170//! The caller is responsible for calling `raw_finish` at the end of encoding process.
171//!
172//! **`RawDecoder`** is an experimental incremental decoder.
173//! At each step of `raw_feed`, it receives a slice of byte sequence
174//! and emits any decoded characters to a generic `StringWriter` (normally `String`).
175//! Otherwise it is identical to `RawEncoder`s.
176//!
177//! One should prefer `Encoding::{encode,decode}` as a primary interface.
//! `RawEncoder` and `RawDecoder` are experimental and can change substantially.
179//! See the additional documents on `encoding::types` module for more information on them.
180//!
181//! ## Supported Encodings
182//!
183//! Encoding covers all encodings specified by WHATWG Encoding Standard and some more:
184//!
185//! * 7-bit strict ASCII (`ascii`)
186//! * UTF-8 (`utf-8`)
187//! * UTF-16 in little endian (`utf-16` or `utf-16le`) and big endian (`utf-16be`)
//! * All single byte encodings in WHATWG Encoding Standard:
189//! * IBM code page 866
190//! * ISO 8859-{2,3,4,5,6,7,8,10,13,14,15,16}
191//! * KOI8-R, KOI8-U
192//! * MacRoman (`macintosh`), Macintosh Cyrillic encoding (`x-mac-cyrillic`)
193//! * Windows code pages 874, 1250, 1251, 1252 (instead of ISO 8859-1), 1253,
194//! 1254 (instead of ISO 8859-9), 1255, 1256, 1257, 1258
195//! * All multi byte encodings in WHATWG Encoding Standard:
196//! * Windows code page 949 (`euc-kr`, since the strict EUC-KR is hardly used)
197//! * EUC-JP and Windows code page 932 (`shift_jis`,
198//! since it's the most widespread extension to Shift_JIS)
199//! * ISO-2022-JP with asymmetric JIS X 0212 support
200//! (Note: this is not yet up to date to the current standard)
201//! * GBK
202//! * GB 18030
203//! * Big5-2003 with HKSCS-2008 extensions
204//! * Encodings that were originally specified by WHATWG Encoding Standard:
205//! * HZ
206//! * ISO 8859-1 (distinct from Windows code page 1252)
207//! * ArmSCII-8 (`armscii-8`)
208//! * Code page 437 (`cp437`)
209//!
210//! Parenthesized names refer to the encoding's primary name assigned by WHATWG Encoding Standard.
211//!
212//! Many legacy character encodings lack the proper specification,
//! and even those that have a specification are highly dependent on the actual implementation.
214//! Consequently one should be careful when picking a desired character encoding.
215//! The only standards reliable in this regard are WHATWG Encoding Standard and
216//! [vendor-provided mappings from the Unicode consortium](http://www.unicode.org/Public/MAPPINGS/).
217//! Whenever in doubt, look at the source code and specifications for detailed explanations.
218
219#![cfg_attr(test, feature(test))] // lib stability features as per RFC #507
220
221extern crate encoding_index_japanese as index_japanese;
222extern crate encoding_index_korean as index_korean;
223extern crate encoding_index_simpchinese as index_simpchinese;
224extern crate encoding_index_singlebyte as index_singlebyte;
225extern crate encoding_index_tradchinese as index_tradchinese;
226extern crate encoding_types;
227
228#[cfg(test)]
229extern crate test;
230
// Only compiled when rustdoc is collecting doctests (`cfg(doctest)`).
// The README text is attached as documentation to a dummy item, presumably so
// that the code examples in the README are exercised alongside the crate docs.
#[cfg(doctest)]
mod test_readme {
    // Expands to an empty `extern "C"` block whose doc attribute is the given text.
    macro_rules! doc_test_from {
        ($contents:expr) => {
            #[doc = $contents]
            extern "C" {}
        };
    }

    doc_test_from!(include_str!("../README.md"));
}
242
243pub use self::types::{
244 ByteWriter, CodecError, DecoderTrap, DecoderTrapFunc, EncoderTrap, EncoderTrapFunc, Encoding,
245 EncodingRef, RawDecoder, RawEncoder, StringWriter,
246}; // reexport
247use std::borrow::Cow;
248
249#[macro_use]
250mod util;
251#[cfg(test)]
252#[macro_use]
253mod testutils;
254
255pub mod types;
256
/// Codec implementations.
pub mod codec {
    // Each declaration below is an out-of-line public submodule; the compiler
    // loads its body from a separate file rather than from this one.
    // The per-module notes are inferred from the module names and the crate-level
    // list of supported encodings -- confirm against the module sources.
    pub mod ascii; // presumably 7-bit strict ASCII
    pub mod error; // NOTE(review): purpose not visible from here -- confirm
    pub mod japanese; // presumably the Japanese multi-byte encodings
    pub mod korean; // presumably the Korean multi-byte encodings
    pub mod simpchinese; // presumably the Simplified Chinese multi-byte encodings
    pub mod singlebyte; // presumably the table-driven single-byte encodings
    pub mod tradchinese; // presumably the Traditional Chinese multi-byte encodings
    pub mod utf_16; // presumably UTF-16 in both byte orders
    pub mod utf_8; // presumably UTF-8
    pub mod whatwg; // NOTE(review): presumably WHATWG-specific encodings -- confirm
}
270
271pub mod all;
272pub mod label;
273
274/// Determine the encoding by looking for a Byte Order Mark (BOM)
275/// and decoded a single string in memory.
276/// Return the result and the used encoding.
277pub fn decode(
278 input: &[u8],
279 trap: DecoderTrap,
280 fallback_encoding: EncodingRef,
281) -> (Result<String, Cow<'static, str>>, EncodingRef) {
282 use crate::all::{UTF_16BE, UTF_16LE, UTF_8};
283 if input.starts_with(&[0xEF, 0xBB, 0xBF]) {
284 (UTF_8.decode(&input[3..], trap), UTF_8 as EncodingRef)
285 } else if input.starts_with(&[0xFE, 0xFF]) {
286 (UTF_16BE.decode(&input[2..], trap), UTF_16BE as EncodingRef)
287 } else if input.starts_with(&[0xFF, 0xFE]) {
288 (UTF_16LE.decode(&input[2..], trap), UTF_16LE as EncodingRef)
289 } else {
290 (fallback_encoding.decode(input, trap), fallback_encoding)
291 }
292}
293
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises BOM sniffing in `decode`: each BOM-prefixed input is followed by
    /// the same payload without its BOM, which must go to the ISO 8859-1 fallback.
    #[test]
    fn test_decode() {
        /// Decodes `input` strictly with ISO 8859-1 as the fallback, then checks the
        /// decoded text and the name of the encoding reported as used.
        fn test_one(input: &[u8], expected_result: &str, expected_encoding: &str) {
            let (result, used_encoding) =
                decode(input, DecoderTrap::Strict, all::ISO_8859_1 as EncodingRef);
            let result = result.unwrap();
            assert_eq!(used_encoding.name(), expected_encoding);
            assert_eq!(&result[..], expected_result);
        }

        test_one(&[0xEF, 0xBB, 0xBF, 0xC3, 0xA9], "é", "utf-8");
        // Without a BOM the bytes go to ISO 8859-1, which maps every byte to the
        // code point of the same value: the UTF-8 bytes of U+00E9 therefore decode
        // to the two characters U+00C3 U+00A9, not to a single U+00E9.
        // Written with escapes so the expectation survives re-encoding of this file.
        test_one(&[0xC3, 0xA9], "\u{c3}\u{a9}", "iso-8859-1");

        test_one(&[0xFE, 0xFF, 0x00, 0xE9], "é", "utf-16be");
        test_one(&[0x00, 0xE9], "\x00é", "iso-8859-1");

        test_one(&[0xFF, 0xFE, 0xE9, 0x00], "é", "utf-16le");
        test_one(&[0xE9, 0x00], "é\x00", "iso-8859-1");
    }
}
317}