1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
//! The `deunicode` library transliterates Unicode strings such as "Æneid" into pure
//! ASCII ones such as "AEneid".
//!
//! Supports no-std. Stores Unicode data in a compact format.
//!
//! It started as a Rust port of the [`Text::Unidecode`](http://search.cpan.org/~sburke/Text-Unidecode-1.30/lib/Text/Unidecode.pm) Perl module, and was extended to support emoji.
//!
//! See [README](https://github.com/kornelski/deunicode/blob/master/README.md) for more info.
//!
//! Examples
//! --------
#![cfg_attr(feature = "alloc", doc = "```rust")]
#![cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
//! use deunicode::deunicode;
//!
//! assert_eq!(deunicode("Æneid"), "AEneid");
//! assert_eq!(deunicode("étude"), "etude");
//! assert_eq!(deunicode("北亰"), "Bei Jing");
//! assert_eq!(deunicode("ᔕᓇᓇ"), "shanana");
//! assert_eq!(deunicode("げんまい茶"), "genmaiCha");
//! assert_eq!(deunicode("🦄☣"), "unicorn biohazard");
//! assert_eq!(deunicode("…"), "...");
//!
//! // format without a temporary string
//! use deunicode::AsciiChars;
//! format!("what's up {}", "🐶".ascii_chars());
#![doc = "```"] // to mollify some syntax highlighters

#![no_std]

#[cfg(any(test, feature = "alloc"))]
extern crate alloc;
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
#[cfg(feature = "alloc")]
use alloc::string::String;

use core::iter::FusedIterator;
use core::str::Chars;

/// Text embedded from `mapping.txt` at compile time.
///
/// `deunicode_char` slices transliterations longer than two bytes out of this string,
/// using the byte offset and length stored in the character's `Ptr` entry.
const MAPPING: &str = include_str!("mapping.txt");

/// One entry of the `POINTERS` lookup table, which `deunicode_char` indexes by code point.
///
/// The struct is `repr(C)` and made only of `u8` fields, so it occupies exactly three
/// bytes with alignment 1. `POINTERS` relies on this layout when it reinterprets the
/// bytes of `pointers.bin` as a slice of `Ptr`.
#[repr(C)]
#[derive(Copy, Clone)]
struct Ptr {
    /// if len <= 2, it's the string itself,
    /// otherwise it's a little-endian u16 offset into MAPPING
    chr: [u8; 2],
    /// Length of the transliteration in bytes. `deunicode_char` notes that unknown
    /// characters are intentionally given a length that is out of range for MAPPING,
    /// so looking them up yields `None`.
    len: u8,
}

/// Raw bytes of the lookup table, embedded from `pointers.bin` at compile time.
const POINTERS_BYTES: &[u8] = include_bytes!("pointers.bin");
/// POINTERS format is described by struct Ptr
///
/// `deunicode_char` indexes this table by the character's code point.
// SAFETY: `Ptr` is `repr(C)` and consists solely of `u8` fields, so it has size 3,
// alignment 1, no padding, and every bit pattern is a valid value. The element count is
// rounded down, so the slice never extends past `POINTERS_BYTES`, which is `'static`.
const POINTERS: &[Ptr] = unsafe { core::slice::from_raw_parts(POINTERS_BYTES.as_ptr().cast(), POINTERS_BYTES.len() / core::mem::size_of::<Ptr>()) };

/// Transliterates any Unicode string into an ASCII-only `String`.
///
/// Guarantees and Warnings
/// -----------------------
/// Guarantees you have when calling [`deunicode()`]:
///   * The returned `String` is valid ASCII; the decimal representation of
///     every `char` in it is between 0 and 127, inclusive.
///   * Every ASCII character (0x0000 - 0x007F) is mapped to itself.
///   * Non-ASCII characters only ever produce newlines (`"\n"`) or ASCII characters
///     in the range 0x0020 - 0x007E. So for example, no Unicode character will
///     translate to `\u{01}`. The only way to get `"\u{01}"` in the output is to pass
///     in the ASCII character `'\u{01}'` itself, which is mapped to itself.
///
/// Things you should keep in mind:
///   * As stated, some transliterations do produce `\n` characters.
///   * Some Unicode characters transliterate to an empty string on purpose.
///   * Some Unicode characters are unknown and transliterate to `"[?]"`
///     (see [`deunicode_with_tofu()`] to choose a different placeholder).
///   * Many Unicode characters transliterate to multi-character strings. For
///     example, 北 is transliterated as "Bei ".
///   * Han characters are mapped to Mandarin, and will be mostly illegible to Japanese readers.
#[inline(always)]
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode(s: &str) -> String {
    // Placeholder substituted for characters with no known transliteration.
    const UNKNOWN_PLACEHOLDER: &str = "[?]";
    deunicode_with_tofu(s, UNKNOWN_PLACEHOLDER)
}

/// Like [`deunicode()`], except that characters without a known transliteration are
/// replaced with `custom_placeholder` instead of `"[?]"`.
///
/// Pass "\u{FFFD}" to get the usual Unicode Replacement Character.
///
/// "Tofu" is a nickname for a replacement character, which in Unicode fonts usually
/// looks like a block of tofu.
#[inline]
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode_with_tofu(s: &str, custom_placeholder: &str) -> String {
    // A borrowed result is copied into a new `String`; an owned one is moved out as-is.
    let transliterated = deunicode_with_tofu_cow(s, custom_placeholder);
    transliterated.into_owned()
}

/// Same as [`deunicode_with_tofu()`], but avoids allocating a new `String` if not necessary.
///
/// If `s` contains only ASCII it is returned unchanged as `Cow::Borrowed`; otherwise the
/// transliteration is written to a newly allocated `String` and returned as `Cow::Owned`.
///
/// You can use "\u{FFFD}" to use the usual Unicode Replacement Character.
///
/// "Tofu" is a nickname for a replacement character, which in Unicode fonts usually
/// looks like a block of tofu.
///
/// # Panics
///
/// Panics if the initial buffer for the output string cannot be reserved.
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode_with_tofu_cow<'input>(s: &'input str, custom_placeholder: &str) -> Cow<'input, str> {
    // Fast path to skip over ASCII chars at the beginning of the string.
    // The whole ASCII range, DEL (0x7F) included, is documented to map to itself, so a
    // string made only of such bytes can be handed back without allocating.
    let ascii_len = s.as_bytes().iter().take_while(|c| c.is_ascii()).count();
    if ascii_len >= s.len() { // >= elides bounds check in split_at
        return Cow::Borrowed(s);
    }

    let (ascii, rest) = s.as_bytes().split_at(ascii_len);
    debug_assert!(core::str::from_utf8(ascii).is_ok());
    // SAFETY: every byte of `ascii` passed the `is_ascii` check above, and ASCII is valid UTF-8.
    let ascii = unsafe { core::str::from_utf8_unchecked(ascii) };

    // reserve a bit more space to avoid reallocations on longer transliterations
    // but instead of `+ 16` uses `| 15` to stay in the smallest allocation bucket for short strings
    let mut out = String::new();
    // this generates less code than with_capacity()
    out.try_reserve_exact(s.len() | 15).unwrap_or_else(|_| panic!());

    // this if optimizes out unused realloc code from push_str
    // (the reservation above always leaves room for `ascii`, so the branch is always taken)
    let needs_to_grow = ascii.as_bytes().len() > out.capacity().wrapping_sub(out.len());
    if !needs_to_grow {
        out.push_str(ascii);
    }

    debug_assert!(core::str::from_utf8(rest).is_ok());
    // SAFETY: `s` is valid UTF-8 and everything before `ascii_len` is single-byte ASCII,
    // so `ascii_len` is a char boundary and `rest` is itself valid UTF-8.
    let s = unsafe { core::str::from_utf8_unchecked(rest) };

    out.extend(s.ascii_chars().map(move |ch| ch.unwrap_or(custom_placeholder)));
    Cow::Owned(out)
}

/// Transliterates a single Unicode character into ASCII.
///
/// Returns `None` when the character has no known transliteration.
///
/// The warnings and guarantees of [`deunicode()`] apply to this function as well.
///
/// Examples
/// --------
/// ```rust
/// # use deunicode::deunicode_char;
/// assert_eq!(deunicode_char('Æ'), Some("AE"));
/// assert_eq!(deunicode_char('北'), Some("Bei "));
/// ```
#[inline]
#[must_use]
pub fn deunicode_char(ch: char) -> Option<&'static str> {
    // Code points beyond the end of the table have no transliteration.
    let entry = POINTERS.get(ch as usize)?;
    let len = usize::from(entry.len);

    if len > 2 {
        // Longer strings live in MAPPING; `chr` holds the little-endian u16 start offset.
        // unknown characters are intentionally mapped to out of range length,
        // which makes this lookup return `None`
        let start = usize::from(u16::from_le_bytes(entry.chr));
        return MAPPING.get(start..start + len);
    }

    // if length is 0, 1 or 2, then the "pointer" data is used to store the string itself
    let inline = entry.chr.get(..len)?;
    debug_assert!(core::str::from_utf8(inline).is_ok());
    // SAFETY: relies on the table storing only ASCII bytes inline (checked above in debug builds).
    Some(unsafe { core::str::from_utf8_unchecked(inline) })
}

/// Convenience functions for deunicode. `use deunicode::AsciiChars`
///
/// Implemented for `str`, and for `String` when the `alloc` feature is enabled.
pub trait AsciiChars {
    /// Iterate over Unicode characters converted to ASCII sequences.
    ///
    /// Items of this iterator may be `None` for some characters
    /// (those that have no known transliteration).
    /// Use `.map(|ch| ch.unwrap_or("?"))` to replace invalid characters.
    ///
    /// Alternatively, this iterator can be used in formatters
    /// (unknown characters are then written as "\u{FFFD}"):
    #[cfg_attr(feature = "alloc", doc = "```rust")]
    #[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
    /// use deunicode::AsciiChars;
    /// format!("what's up {}", "🐶".ascii_chars());
    #[doc = "```"]
    fn ascii_chars(&self) -> AsciiCharsIter<'_>;

    /// Convert any Unicode string to ASCII-only string.
    ///
    /// Characters are converted to closest ASCII equivalent.
    /// Characters that can't be converted are replaced with `"[?]"`.
    ///
    /// Only available when the `alloc` feature is enabled.
    #[cfg(feature = "alloc")]
    fn to_ascii_lossy(&self) -> String;
}

#[cfg(feature = "alloc")]
impl AsciiChars for String {
    /// Iterates over the ASCII transliteration of each character of this string.
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        AsciiCharsIter::new(self.as_str())
    }

    /// Transliterates this string with [`deunicode()`].
    #[inline(always)]
    fn to_ascii_lossy(&self) -> String {
        deunicode(self.as_str())
    }
}

impl AsciiChars for str {
    /// Iterates over the ASCII transliteration of each character of this string slice.
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        let text: &str = self;
        AsciiCharsIter::new(text)
    }

    /// Transliterates this string slice with [`deunicode()`].
    #[inline(always)]
    #[cfg(feature = "alloc")]
    fn to_ascii_lossy(&self) -> String {
        let text: &str = self;
        deunicode(text)
    }
}

/// Iterator that translates Unicode characters to ASCII strings.
///
/// Each item is `Some(ascii)` for a character with a known transliteration, or `None`
/// for a character without one.
///
/// See [`AsciiChars`] trait's `str.ascii_chars()` method.
///
/// Additionally, it implements `Display` for formatting strings without allocations.
///
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
/// use deunicode::AsciiChars;
/// format!("what's up {}", "🐶".ascii_chars());
#[doc = "```"]
#[derive(Clone)]
pub struct AsciiCharsIter<'a> {
    /// One-item lookahead: the transliteration of the next character to be yielded.
    /// The outer `Option` is `None` once the input is exhausted; the inner one is `None`
    /// when that character has no known transliteration. `next()` uses the lookahead
    /// to decide whether the current item's trailing space should be dropped.
    next_char: Option<Option<&'static str>>,
    /// The characters that follow the one held in `next_char`.
    chars: Chars<'a>,
}

/// Use `.map(|ch| ch.unwrap_or("?"))` to replace invalid characters.
impl<'a> AsciiCharsIter<'a> {
    /// Creates an iterator over the transliterations of the characters of `unicode_string`.
    ///
    /// The first character is transliterated immediately to fill the lookahead slot.
    #[inline]
    pub fn new(unicode_string: &'a str) -> Self {
        let mut remaining = unicode_string.chars();
        let first = remaining.next().map(deunicode_char);
        Self {
            next_char: first,
            chars: remaining,
        }
    }
}

// Once the lookahead slot is empty, `next()` returns `None` without advancing `chars`,
// so the iterator keeps returning `None` after it has finished.
impl FusedIterator for AsciiCharsIter<'_> {}

impl<'a> Iterator for AsciiCharsIter<'a> {
    type Item = Option<&'static str>;

    /// Yields the transliteration of the next character, or `Some(None)` when that
    /// character has no known transliteration.
    ///
    /// A transliteration longer than one byte that ends with a space (such as "Bei ")
    /// has that space removed when it is the last item, or when the following item
    /// already begins with a space, so the space is not doubled.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let current = self.next_char?;
        // Refill the lookahead before deciding how to emit `current`.
        self.next_char = self.chars.next().map(deunicode_char);
        let text = match current {
            Some(text) => text,
            None => return Some(None),
        };

        let has_trailing_space = text.len() > 1 && text.ends_with(' ');
        let drop_trailing_space = has_trailing_space
            && match self.next_char {
                // true if end
                None => true,
                // space next (assume placeholder is not space)
                Some(following) => following.map_or(false, |f| f.starts_with(' ')),
            };

        Some(if drop_trailing_space {
            // The removed byte is an ASCII space, so the cut is on a char boundary.
            text.get(..text.len() - 1)
        } else {
            Some(text)
        })
    }

    /// Counts the remaining items: one per remaining character, plus the lookahead if any.
    #[inline]
    fn count(self) -> usize {
        self.chars.count() + usize::from(self.next_char.is_some())
    }

    /// Bounds taken from the underlying `Chars`, each increased by one when a
    /// lookahead item is pending.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let pending = usize::from(self.next_char.is_some());
        let (lower, upper) = self.chars.size_hint();
        (lower + pending, upper.and_then(|n| n.checked_add(pending)))
    }
}

/// Format without a temporary string
///
/// Characters with no known transliteration are written as "\u{FFFD}".
///
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
/// use deunicode::AsciiChars;
/// format!("what's up {}", "🐶".ascii_chars());
#[doc = "```"]
impl core::fmt::Display for AsciiCharsIter<'_> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Walk a clone so that formatting does not consume the iterator itself.
        for piece in self.clone() {
            f.write_str(piece.unwrap_or("\u{FFFD}"))?;
        }
        Ok(())
    }
}

// Pins down how `AsciiCharsIter` treats the trailing space of multi-character
// transliterations.
#[test]
fn iter_test() {
    use alloc::vec::Vec;
    // At the end of the input the last item loses its trailing space.
    let chars: Vec<_> = AsciiCharsIter::new("中国").flatten().collect();
    assert_eq!(&chars, &["Zhong ", "Guo"]);
    // Before an item that does not start with a space, the trailing space is kept.
    let chars: Vec<_> = "中国x".ascii_chars().flatten().collect();
    assert_eq!(&chars, &["Zhong ", "Guo ", "x"]);
    // Before a literal space, the trailing space is dropped so it is not doubled.
    let chars: Vec<_> = "中 国".ascii_chars().flatten().collect();
    assert_eq!(&chars, &["Zhong", " ", "Guo"]);
}