unidecode/
lib.rs

1//! The `rust-unidecode` library is a Rust port of Sean M. Burke's famous
2//! [`Text::Unidecode`](http://search.cpan.org/~sburke/Text-Unidecode-1.23/lib/Text/Unidecode.pm)
//! module for Perl. It transliterates Unicode strings such as "Æneid" into pure
//! ASCII ones such as "AEneid". For a detailed explanation of the rationale
5//! behind using such a library, you can refer to both the documentation of the
6//! original module and
7//! [this article](http://interglacial.com/~sburke/tpj/as_html/tpj22.html)
8//! written by Burke in 2001.
9//!
10//! The data set used to translate the Unicode was ported directly from the
11//! `Text::Unidecode` module using a Perl script, so `rust-unidecode` should
12//! produce identical output.
13//!
14//! Examples
15//! --------
16//! ```ignore
17//! extern crate unidecode;
18//! use unidecode::unidecode;
19//!
20//! assert_eq!(unidecode("Æneid"), "AEneid");
21//! assert_eq!(unidecode("étude"), "etude");
22//! assert_eq!(unidecode("北亰"), "Bei Jing");
23//! assert_eq!(unidecode("ᔕᓇᓇ"), "shanana");
24//! assert_eq!(unidecode("げんまい茶"), "genmaiCha ");
25//! ```
26
27mod data;
28use data::MAPPING;
29
30/// This function takes any Unicode string and returns an ASCII transliteration
31/// of that string.
32///
33/// # Guarantees
34///
35/// * The `String` returned will be valid ASCII; the decimal representation of
36///   every `char` in the string will be between 0 and 127, inclusive.
37/// * Every ASCII character (0x0000 - 0x007F) is mapped to itself.
38/// * All Unicode characters will translate to a string containing newlines
39///   (`"\n"`) or ASCII characters in the range 0x0020 - 0x007E. So for example,
40///   no Unicode character will translate to `\u{01}`. The exception is if the
41///   ASCII character itself is passed in, in which case it will be mapped to
42///   itself. (So `'\u{01}'` will be mapped to `"\u{01}"`.)
43///
44/// # Warnings
45///
46/// * As stated, some transliterations do produce `\n` characters.
47/// * Some Unicode characters transliterate to an empty string, either on
48///   purpose or because `rust-unidecode` does not know about the character.
49/// * Some Unicode characters are unknown and transliterate to `"[?]"`.
50/// * Many Unicode characters transliterate to multi-character strings. For
51///   example, 北 is transliterated as "Bei ".
52///
53/// These guarantees/warnings are paraphrased from the original
54/// `Text::Unidecode` documentation.
55pub fn unidecode(s: &str) -> String {
56    s.chars().map(|ch| unidecode_char(ch)).collect()
57}
58
59/// This function takes a single Unicode character and returns an ASCII
60/// transliteration.
61///
62/// The warnings and guarantees of `unidecode()` apply to this function as well.
63///
64/// Examples
65/// --------
66/// ```ignore
67/// assert_eq!(unidecode_char('Æ'), "AE");
68/// assert_eq!(unidecode_char('北'), "Bei ");
69/// ```
70#[inline]
71pub fn unidecode_char(ch: char) -> &'static str {
72    MAPPING.get(ch as usize).map(|&s| s).unwrap_or("")
73}