Skip to main content

deunicode/
lib.rs

1//! The `deunicode` library transliterates Unicode strings such as "Æneid" into pure
2//! ASCII ones such as "AEneid."
3//!
4//! Supports no-std. Stores Unicode data in a compact format.
5//!
6//! It started as a Rust port of [`Text::Unidecode`](http://search.cpan.org/~sburke/Text-Unidecode-1.30/lib/Text/Unidecode.pm) Perl module, and was extended to support emoji.
7//!
8//! See [README](https://github.com/kornelski/deunicode/blob/master/README.md) for more info.
9//!
10//! Examples
11//! --------
12#![cfg_attr(feature = "alloc", doc = "```rust")]
13#![cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
14//! use deunicode::deunicode;
15//!
16//! assert_eq!(deunicode("Æneid"), "AEneid");
17//! assert_eq!(deunicode("étude"), "etude");
18//! assert_eq!(deunicode("北亰"), "Bei Jing");
19//! assert_eq!(deunicode("ᔕᓇᓇ"), "shanana");
20//! assert_eq!(deunicode("げんまい茶"), "genmaiCha");
21//! assert_eq!(deunicode("🦄☣"), "unicorn biohazard");
22//! assert_eq!(deunicode("…"), "...");
23//!
24//! // format without a temporary string
25//! use deunicode::AsciiChars;
26//! format!("what's up {}", "🐶".ascii_chars());
27#![doc = "```"] // to mollify some syntax highlighters
28
29#![no_std]
30
31#[cfg(any(test, feature = "alloc"))]
32extern crate alloc;
33#[cfg(feature = "alloc")]
34use alloc::borrow::Cow;
35#[cfg(feature = "alloc")]
36use alloc::string::String;
37
38use core::iter::FusedIterator;
39use core::str::Chars;
40
41const MAPPING: &str = include_str!("mapping.txt");
42
43#[repr(C)]
44#[derive(Copy, Clone)]
45struct Ptr {
46    /// if len <= 2, it's the string itself,
47    /// otherwise it's an u16 offset into MAPPING
48    chr: [u8; 2],
49    len: u8,
50}
51
52const POINTERS_BYTES: &[u8] = include_bytes!("pointers.bin");
53/// POINTERS format is described by struct Ptr
54const POINTERS: &[Ptr] = unsafe { core::slice::from_raw_parts(POINTERS_BYTES.as_ptr().cast(), POINTERS_BYTES.len() / core::mem::size_of::<Ptr>()) };
55
/// Transliterates an arbitrary Unicode string into a plain ASCII `String`.
///
/// Guarantees and Warnings
/// -----------------------
/// What you can rely on when calling [`deunicode()`]:
///   * The returned `String` is valid ASCII: every `char` in it has a value
///     between 0 and 127, inclusive.
///   * ASCII characters (0x0000 - 0x007F) in the input come out unchanged.
///   * A non-ASCII character only ever produces newlines (`"\n"`) and ASCII
///     characters in the range 0x0020 - 0x007E, so nothing transliterates to,
///     say, `\u{01}`. Such a character appears in the output only when the
///     input already contained it (`'\u{01}'` stays `"\u{01}"`).
///
/// Things to watch out for:
///   * As noted above, some transliterations contain `\n`.
///   * Some characters deliberately transliterate to an empty string.
///   * Unknown characters become `"[?]"` (see [`deunicode_with_tofu()`] to
///     choose a different placeholder).
///   * Many characters expand to several ASCII characters. For example,
///     北 is transliterated as "Bei ".
///   * Han characters are romanized as Mandarin, so the result will be mostly
///     illegible to Japanese readers.
#[cfg(feature = "alloc")]
#[must_use]
#[inline(always)]
pub fn deunicode(s: &str) -> String {
    // placeholder substituted for characters with no known transliteration
    const DEFAULT_TOFU: &str = "[?]";
    deunicode_with_tofu(s, DEFAULT_TOFU)
}
84
/// Like [`deunicode()`], except that characters without a known
/// transliteration are replaced with `custom_placeholder`.
///
/// Pass "\u{FFFD}" to get the usual Unicode Replacement Character.
///
/// "Tofu" is a nickname for a replacement character, which in Unicode fonts
/// usually looks like a block of tofu.
#[cfg(feature = "alloc")]
#[must_use]
#[inline]
pub fn deunicode_with_tofu(s: &str, custom_placeholder: &str) -> String {
    match deunicode_with_tofu_cow(s, custom_placeholder) {
        Cow::Owned(converted) => converted,
        // the input was handed back untouched, so it has to be copied
        Cow::Borrowed(unchanged) => String::from(unchanged),
    }
}
97
/// Same as [`deunicode_with_tofu()`], but avoids allocating a new `String` if not necessary.
///
/// When `s` consists of ASCII only, it is returned as `Cow::Borrowed` without
/// copying. Otherwise a new `String` is built in which every character is
/// transliterated and unknown characters are replaced with `custom_placeholder`.
///
/// You can use "\u{FFFD}" to use the usual Unicode Replacement Character.
///
/// "Tofu" is a nickname for a replacement character, which in Unicode fonts usually
/// looks like a block of tofu.
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode_with_tofu_cow<'input>(s: &'input str, custom_placeholder: &str) -> Cow<'input, str> {
    // Fast path to skip over ASCII chars at the beginning of the string.
    // The whole ASCII range counts, DEL (0x7F) included, so that pure-ASCII
    // input is always returned borrowed.
    let ascii_len = s.as_bytes().iter().take_while(|&&c| c.is_ascii()).count();
    if ascii_len >= s.len() { // >= elides bounds check in split_at
        return Cow::Borrowed(s);
    }

    let (ascii, rest) = s.as_bytes().split_at(ascii_len);
    debug_assert!(core::str::from_utf8(ascii).is_ok());
    // SAFETY: every byte of `ascii` has been checked to be ASCII, which is valid UTF-8
    let ascii = unsafe { core::str::from_utf8_unchecked(ascii) };

    // reserve a bit more space to avoid reallocations on longer transliterations
    // but instead of `+ 16` uses `| 15` to stay in the smallest allocation bucket for short strings
    let mut out = String::new();
    // this generates less code than with_capacity()
    out.try_reserve_exact(s.len() | 15).unwrap_or_else(|_| panic!());

    // this if optimizes out unused realloc code from push_str;
    // the reservation above is at least `s.len()` bytes, so the prefix always fits
    let needs_to_grow = ascii.as_bytes().len() > out.capacity().wrapping_sub(out.len());
    if !needs_to_grow {
        out.push_str(ascii);
    }

    debug_assert!(core::str::from_utf8(rest).is_ok());
    // SAFETY: `s` is valid UTF-8 and only single-byte (ASCII) characters precede
    // the split point, so `rest` starts on a character boundary
    let s = unsafe { core::str::from_utf8_unchecked(rest) };

    out.extend(s.ascii_chars().map(move |ch| ch.unwrap_or(custom_placeholder)));
    Cow::Owned(out)
}
137
138/// This function takes a single Unicode character and returns an ASCII
139/// transliteration.
140///
141/// The warnings and guarantees of [`deunicode()`] apply to this function as well.
142///
143/// Examples
144/// --------
145/// ```rust
146/// # use deunicode::deunicode_char;
147/// assert_eq!(deunicode_char('Æ'), Some("AE"));
148/// assert_eq!(deunicode_char('北'), Some("Bei "));
149/// ```
150#[inline]
151#[must_use]
152pub fn deunicode_char(ch: char) -> Option<&'static str> {
153    if let Some(p) = POINTERS.get(ch as usize) {
154        // if length is 1 or 2, then the "pointer" data is used to store the char
155        if p.len <= 2 {
156            let chars = p.chr.get(..p.len as usize)?;
157            // safe, because we're returning only ASCII
158            debug_assert!(core::str::from_utf8(chars).is_ok());
159            unsafe {
160                Some(core::str::from_utf8_unchecked(chars))
161            }
162        } else {
163            let map_pos = (u16::from(p.chr[0]) | u16::from(p.chr[1]) << 8) as usize;
164            // unknown characters are intentionally mapped to out of range length
165            MAPPING.get(map_pos..map_pos + p.len as usize)
166        }
167    } else {
168        None
169    }
170}
171
172/// Convenience functions for deunicode. `use deunicode::AsciiChars`
173pub trait AsciiChars {
174    /// Iterate over Unicode characters converted to ASCII sequences.
175    ///
176    /// Items of this iterator may be `None` for some characters.
177    /// Use `.map(|ch| ch.unwrap_or("?"))` to replace invalid characters.
178    ///
179    /// Alternatively, this iterator can be used in formatters:
180    #[cfg_attr(feature = "alloc", doc = "```rust")]
181    #[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
182    /// use deunicode::AsciiChars;
183    /// format!("what's up {}", "🐶".ascii_chars());
184    #[doc = "```"]
185    fn ascii_chars(&self) -> AsciiCharsIter<'_>;
186
187    /// Convert any Unicode string to ASCII-only string.
188    ///
189    /// Characters are converted to closest ASCII equivalent.
190    /// Characters that can't be converted are replaced with `"[?]"`.
191    #[cfg(feature = "alloc")]
192    fn to_ascii_lossy(&self) -> String;
193}
194
// Owned strings simply forward to the same routines as `str`.
#[cfg(feature = "alloc")]
impl AsciiChars for String {
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        AsciiCharsIter::new(self.as_str())
    }

    #[inline(always)]
    fn to_ascii_lossy(&self) -> String {
        deunicode(self.as_str())
    }
}
206
207impl AsciiChars for str {
208    #[inline(always)]
209    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
210        AsciiCharsIter::new(self)
211    }
212    #[inline(always)]
213    #[cfg(feature = "alloc")]
214    fn to_ascii_lossy(&self) -> String {
215        deunicode(self)
216    }
217}
218
/// Iterator yielding the ASCII transliteration of each character of a string.
///
/// Usually obtained through the [`AsciiChars`] trait, as `str.ascii_chars()`.
///
/// It also implements `Display`, so it can be used in format strings
/// without building an intermediate string first.
///
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
/// use deunicode::AsciiChars;
/// format!("what's up {}", "🐶".ascii_chars());
#[doc = "```"]
#[derive(Clone)]
pub struct AsciiCharsIter<'a> {
    // Lookahead: the transliteration `next()` will hand out on its next call.
    // Outer `None` means the input is exhausted, inner `None` means the
    // character has no transliteration.
    next_char: Option<Option<&'static str>>,
    // Input characters that have not been looked up yet.
    chars: Chars<'a>,
}
235
236/// Use `.map(|ch| ch.unwrap_or("?"))` to replace invalid characters.
237impl<'a> AsciiCharsIter<'a> {
238    #[inline]
239    pub fn new(unicode_string: &'a str) -> Self {
240        let mut chars = unicode_string.chars();
241        Self {
242            next_char: chars.next().map(deunicode_char),
243            chars,
244        }
245    }
246}
247
248impl<'a> FusedIterator for AsciiCharsIter<'a> {}
249
250impl<'a> Iterator for AsciiCharsIter<'a> {
251    type Item = Option<&'static str>;
252
253    #[inline]
254    fn next(&mut self) -> Option<Self::Item> {
255        let dch = self.next_char?;
256        self.next_char = self.chars.next().map(deunicode_char);
257        let dch = match dch {
258            None => return Some(None),
259            Some(dch) => dch,
260        };
261        // ends with space
262        let trim_last_char = dch.as_bytes().len() > 1 && dch.as_bytes().last().copied() == Some(b' ') &&
263            self.next_char.map_or(true, |ch| { // true if end
264            ch.map_or(false, |ch| ch.as_bytes().first().copied() == Some(b' ')) // space next (assume placeholder is not space)
265        });
266        Some(if !trim_last_char {
267            Some(dch)
268        } else {
269            dch.get(..dch.len()-1)
270        })
271    }
272
273    #[inline]
274    fn count(self) -> usize {
275        self.chars.count() + if self.next_char.is_some() {1} else {0}
276    }
277
278    #[inline]
279    fn size_hint(&self) -> (usize, Option<usize>) {
280        (self.chars.size_hint().0 + if self.next_char.is_some() {1} else {0}, None)
281    }
282}
283
284/// Format without a temporary string
285///
286#[cfg_attr(feature = "alloc", doc = "```rust")]
287#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
288/// use deunicode::AsciiChars;
289/// format!("what's up {}", "🐶".ascii_chars());
290#[doc = "```"]
291impl core::fmt::Display for AsciiCharsIter<'_> {
292    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
293        self.clone().try_for_each(|ch| f.write_str(ch.unwrap_or("\u{FFFD}")))
294    }
295}
296
/// Checks how the iterator treats trailing spaces of multi-character transliterations.
#[test]
fn iter_test() {
    use alloc::vec::Vec;

    // the final item loses its trailing space
    let at_end: Vec<&str> = AsciiCharsIter::new("🄏中国").flatten().collect();
    assert_eq!(&at_end, &["NonCommercial", "Zhong ", "Guo"]);

    // a trailing space is kept when followed by something other than a space
    let before_text: Vec<&str> = "中国x🅶".ascii_chars().flatten().collect();
    assert_eq!(&before_text, &["Zhong ", "Guo ", "x", "G"]);

    // a trailing space is dropped when a space follows
    let before_space: Vec<&str> = "☃中 国".ascii_chars().flatten().collect();
    assert_eq!(&before_space, &["snowman ", "Zhong", " ", "Guo"]);
}
307
/// Text buried under stacked combining marks must come out as its bare base letters.
///
/// Gated on the `alloc` feature because `deunicode_with_tofu` only exists with it;
/// without the gate the test build fails when that feature is disabled.
#[test]
#[cfg(feature = "alloc")]
fn zalgo() {
    assert_eq!(deunicode_with_tofu("h̵̡̢̛̻̬͔̦͓̥̞̳͇̭̣̪̰̞̲̩̭̤͚͖͓̰̭̝̬̖̭͇͇̰͇͓̠͑͆͐͛̏͒͆̊́̊̂̉̉̈́̿̆̾̌̀͒͌́͗͋͜͝͝͝ͅĕ̷̡̧̡̧̜̮͙̗͙͕͖̩͈͙̞̞̭͙̯͖̰͖̙̹͖͚̦̬̄̀̓̈́͗̆̓̽͛̀͛̄͂̉͒̓̐̃̑́͊̀͋͊͗́̈́͑͗̐̔̈͊͋̓͊̓́̏̍̍̓͘̕͝͝͠ͅl̶̠̮̺̦̩͓̣̪͚͌̊̈́̀̄̈́̉͗̀̏͋̆̈̈́̉̋̊̉̉̌̈́̚̕͠͠l̴̨̡͍͇̝̟̩̙̤̰̬̬͖͙̺̟̯͓̥̯͔̤̠̻̤̮̘̋͑̑̿͗͂̃̓̓̉͒̑͜͠ͅo̸̢̧̨̜͉̜͓͙̰̳̙̖̰͇̺͈̝̬̩̫͛̅̍͌̎̅̿̂̚̕͜ ̵̛̗͍̊̈͋̀̊͒̄̔̔͋͋̆͋̅̀͂͂̍́̀̈́̈́͂̂̂̆̅͗̄̈́̀̈́̅̒̈̋͊̍̈́͂̑̓̽̂̂̓̚̕̚̕̚͠͝w̷̨͍͖̗͔͖͎̩̠̜͖̞͍̘̤͕̮̥̭͛̆̎̋̄͒̓̈́͆̀̆̚ǫ̷̢̢̧̧̨̧̧̨̢̼̮̺̬͇͓̪̯͖̥͙̠͍̭̩̰͎̘̺̝̲̖̮̞̝̠̠͎̻̠͙̫͙̞̫̭͖̱͉̱̮̌͑̈̅̈́̊̓͌̇͌̏̾̆͗̉͊̐̈́̾́̔̆͐́͘͜͜͝ͅŗ̵̡̛̛̟̭͉̰̮̺̜̼̰̟̲͖͔͕̰͕͇̪̲̫̬͚̱̮͎̭̩̩̉̇̉̀̉͑̔͋͆͌͜͠ļ̴̢̨̢̛͙̳̮̠͔͇͈̟͇̦̯͖̖͚̺̤͈̻͔̤̤̪̫͔͕̻̟̥̤̩͚̟̳͔̘̤͈͍͍̯̻̙̺̪̄̈́́͊̋̊́̅͛̉̊̉̅̋̆̔͑̈́͋̑͂̍̌̓̾̆̕̕͝ͅḏ̶̡̨̢̡̛̙͕̘̜͚̺̬̭̜͖͎͚̹̖͈̖̤͎̙̫͎̜̩̰̬̪̣̎͛̓̏̃͊̈́̽̆̒̈́̎̄̍́͘̚̚͝͠͠ͅ!̶̨̨̨̛̛̟̳̼̘͎͔̜͎͚̖̮̰͕̞̦̩̗̫̠͔͕͎͎͎̦̬̫̩̰̲̈́͋̽̀̒͆̄̑̐̀̐̋͆̈́̊̽̊̅̊̀͆͆͑̈͋̌͆͑̂̊͑̚͝͝ͅͅͅ", ""), "hello world!");
}