deencode/
lib.rs

1//! # Deencode: Reverse engineer encoding errors
2//!
3//! The goal of this crate is to automatically explore the result of
4//! successively encoding then decoding a string using different encoding
5//! schemes, which usually results in some corruption of the non-ASCII
6//! characters.
7//!
8//! ## Concepts
9//!
10//! * [Engines](engine/trait.Engine.html) are objects that represent an encoding
11//!   scheme, and can be used to encode (String to bytes) or decode (bytes to
//!   String). A number of engines are already implemented in this crate, with
13//!   static instances if you want to use them.
14//! * The structure of deencoding is a
15//!   [tree](deencodetree/struct.DeencodeTree.html): from an input string, every
16//!   engine may give an encoding, then every engine gives a decoding of that
17//!   encoding, and so on.
18//!
19//! > _Note_: The deencoding process is not optimised to avoid doing the same
20//! > steps over and over. It is recommended to keep the depth to small numbers.
21//! > Deduplication can then be applied to remove duplication in the tree.
22//!
23//! ## Usage
24//!
25//! ```rust
26//! use deencode::*;
27//!
28//! // List the engines to use.
29//! let engines: Vec<&dyn Engine> = vec![&UTF8, &LATIN1, &MIXED816BE, &MIXED816LE, &UTF7];
30//! // Explore the tree of possible encodings and decodings.
31//! let mut tree = deencode("Clément", &engines, 1);
32//! // Remove duplicate entries from the tree.
33//! let _ = tree.deduplicate();
34//!
35//! // Export the tree with box drawings.
36//! println!("{}", tree);
37//! // Export the tree as JSON.
38//! println!("{}", serde_json::to_string(&tree).unwrap());
39//! ```
40
41pub mod deencodetree;
42pub mod engine;
43pub mod latin1engine;
44pub mod mixed816beengine;
45pub mod mixed816leengine;
46pub mod utf7engine;
47pub mod utf8engine;
48
49pub use engine::Engine;
50pub use deencodetree::DeencodeTree;
51
52/// Provided engine for Latin-1 / ISO-8859-1 / Codepage 1252.
53pub static LATIN1: latin1engine::Latin1Engine = latin1engine::Latin1Engine {};
54/// Provided engine for a mixed UTF-8/UTF-16BE scheme.
55pub static MIXED816BE: mixed816beengine::Mixed816BEEngine =
56    mixed816beengine::Mixed816BEEngine {};
57/// Provided engine for a mixed UTF-8/UTF-16LE scheme.
58pub static MIXED816LE: mixed816leengine::Mixed816LEEngine =
59    mixed816leengine::Mixed816LEEngine {};
60/// Provided engine for UTF-7.
61pub static UTF7: utf7engine::Utf7Engine = utf7engine::Utf7Engine {};
62/// Provided engine for UTF-8.
63pub static UTF8: utf8engine::Utf8Engine = utf8engine::Utf8Engine {};
64
65/// Build a [`DeencodeTree`] by successively running encodings and decodings
66/// through the engines.
67///
68/// Alias of [`DeencodeTree::deencode()`].
69///
70/// `encoding_depth` specifies the number of _encoding_ steps, which are always
71/// followed by a decoding step, so the actual depth of the generated tree is
72/// `2 * encoding_depth`.
73///
74/// The process starts with encoding, so you may not have `depth == 0`. (see
75/// [`EncodeNode::make_nodes()`](deencodetree::EncodeNode::make_nodes)'s
76/// documentation)
77///
78/// The order of the engines matters for [`DeencodeTree::deduplicate()`].
79pub fn deencode(input: &str, engines: &[&dyn Engine], encoding_depth: usize)
80    -> DeencodeTree
81{
82    DeencodeTree::deencode(input, engines, encoding_depth)
83}