1#![cfg_attr(feature = "alloc", doc = "```rust")]
13#![cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
14#![doc = "```"] #![no_std]
30
31#[cfg(any(test, feature = "alloc"))]
32extern crate alloc;
33#[cfg(feature = "alloc")]
34use alloc::borrow::Cow;
35#[cfg(feature = "alloc")]
36use alloc::string::String;
37
38use core::iter::FusedIterator;
39use core::str::Chars;
40
41const MAPPING: &str = include_str!("mapping.txt");
42
/// One entry of the `POINTERS` lookup table, indexed by Unicode code point.
///
/// The layout is fixed with `repr(C)` because the table is reinterpreted
/// directly from the raw bytes of `pointers.bin`.
#[derive(Clone, Copy)]
#[repr(C)]
struct Ptr {
    /// When `len <= 2`: the replacement bytes themselves (only the first `len` are used).
    /// Otherwise: a little-endian `u16` byte offset into `MAPPING`.
    chr: [u8; 2],
    /// Length in bytes of the replacement text.
    len: u8,
}
51
52const POINTERS_BYTES: &[u8] = include_bytes!("pointers.bin");
53const POINTERS: &[Ptr] = unsafe { core::slice::from_raw_parts(POINTERS_BYTES.as_ptr().cast(), POINTERS_BYTES.len() / core::mem::size_of::<Ptr>()) };
55
/// Converts `s` into its replacement text from the lookup tables, writing `"[?]"`
/// for every character that has no replacement.
///
/// This is `deunicode_with_tofu` with the placeholder fixed to `"[?]"`.
#[inline(always)]
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode(s: &str) -> String {
    const DEFAULT_PLACEHOLDER: &str = "[?]";
    deunicode_with_tofu(s, DEFAULT_PLACEHOLDER)
}
84
/// Like `deunicode`, but characters without a replacement are written as
/// `custom_placeholder` instead of `"[?]"`.
///
/// Always returns an owned `String`; use `deunicode_with_tofu_cow` to avoid the copy
/// when the input needs no conversion.
#[inline]
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode_with_tofu(s: &str, custom_placeholder: &str) -> String {
    match deunicode_with_tofu_cow(s, custom_placeholder) {
        Cow::Owned(converted) => converted,
        Cow::Borrowed(unchanged) => String::from(unchanged),
    }
}
97
/// Like `deunicode_with_tofu`, but borrows the input when no conversion is needed.
///
/// Returns `Cow::Borrowed(s)` when every byte of `s` is below `0x7F`. Otherwise the
/// leading run of such bytes is copied as-is and each remaining character is replaced
/// by its table entry, or by `custom_placeholder` when it has none.
///
/// # Panics
///
/// Panics if memory for the output string cannot be reserved.
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode_with_tofu_cow<'input>(s: &'input str, custom_placeholder: &str) -> Cow<'input, str> {
    // Fast path: find the first byte that is not plain ASCII below 0x7F.
    let first_special = match s.bytes().position(|byte| byte >= 0x7F) {
        Some(pos) => pos,
        None => return Cow::Borrowed(s),
    };

    // Everything before `first_special` is single-byte ASCII, so the byte at that index
    // starts a character and the safe `split_at` cannot hit a non-boundary.
    let (ascii_prefix, tail) = s.split_at(first_special);

    // `| 15` rounds the request up a little so longer replacements are less likely to
    // trigger a reallocation, while short inputs keep a small allocation.
    let mut out = String::new();
    out.try_reserve_exact(s.len() | 15).unwrap_or_else(|_| panic!());

    // The reservation above always leaves room for the prefix; phrasing the push behind
    // an explicit capacity test keeps `push_str` from needing its own growth path.
    let spare = out.capacity().wrapping_sub(out.len());
    if ascii_prefix.len() <= spare {
        out.push_str(ascii_prefix);
    }

    out.extend(tail.ascii_chars().map(move |ch| ch.unwrap_or(custom_placeholder)));
    Cow::Owned(out)
}
137
138#[inline]
151#[must_use]
152pub fn deunicode_char(ch: char) -> Option<&'static str> {
153 if let Some(p) = POINTERS.get(ch as usize) {
154 if p.len <= 2 {
156 let chars = p.chr.get(..p.len as usize)?;
157 debug_assert!(core::str::from_utf8(chars).is_ok());
159 unsafe {
160 Some(core::str::from_utf8_unchecked(chars))
161 }
162 } else {
163 let map_pos = (u16::from(p.chr[0]) | u16::from(p.chr[1]) << 8) as usize;
164 MAPPING.get(map_pos..map_pos + p.len as usize)
166 }
167 } else {
168 None
169 }
170}
171
172pub trait AsciiChars {
174 #[cfg_attr(feature = "alloc", doc = "```rust")]
181 #[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
182 #[doc = "```"]
185 fn ascii_chars(&self) -> AsciiCharsIter<'_>;
186
187 #[cfg(feature = "alloc")]
192 fn to_ascii_lossy(&self) -> String;
193}
194
/// `AsciiChars` for owned strings; both methods operate on the string's `&str` view.
#[cfg(feature = "alloc")]
impl AsciiChars for String {
    /// Iterates over the per-character replacements of this string.
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        AsciiCharsIter::new(self.as_str())
    }

    /// Builds the converted copy of this string via `deunicode`.
    #[inline(always)]
    fn to_ascii_lossy(&self) -> String {
        deunicode(self.as_str())
    }
}
206
207impl AsciiChars for str {
208 #[inline(always)]
209 fn ascii_chars(&self) -> AsciiCharsIter<'_> {
210 AsciiCharsIter::new(self)
211 }
212 #[inline(always)]
213 #[cfg(feature = "alloc")]
214 fn to_ascii_lossy(&self) -> String {
215 deunicode(self)
216 }
217}
218
/// Iterator over the replacement text of each character of a string.
///
/// Created by `AsciiCharsIter::new` or `AsciiChars::ascii_chars`. It keeps a
/// one-character look-ahead so that `next` can inspect the following replacement.
///
// The attributes below emit the opening and closing fence of a documentation code
// block; without `alloc` the fence is marked `ignore`.
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
#[doc = "```"]
#[derive(Clone)]
pub struct AsciiCharsIter<'a> {
    /// Look-ahead slot: the outer `None` means the input is exhausted, the inner
    /// `None` means the pending character has no replacement.
    next_char: Option<Option<&'static str>>,
    /// Characters of the input that have not been looked up yet.
    chars: Chars<'a>,
}
235
236impl<'a> AsciiCharsIter<'a> {
238 #[inline]
239 pub fn new(unicode_string: &'a str) -> Self {
240 let mut chars = unicode_string.chars();
241 Self {
242 next_char: chars.next().map(deunicode_char),
243 chars,
244 }
245 }
246}
247
248impl<'a> FusedIterator for AsciiCharsIter<'a> {}
249
250impl<'a> Iterator for AsciiCharsIter<'a> {
251 type Item = Option<&'static str>;
252
253 #[inline]
254 fn next(&mut self) -> Option<Self::Item> {
255 let dch = self.next_char?;
256 self.next_char = self.chars.next().map(deunicode_char);
257 let dch = match dch {
258 None => return Some(None),
259 Some(dch) => dch,
260 };
261 let trim_last_char = dch.as_bytes().len() > 1 && dch.as_bytes().last().copied() == Some(b' ') &&
263 self.next_char.map_or(true, |ch| { ch.map_or(false, |ch| ch.as_bytes().first().copied() == Some(b' ')) });
266 Some(if !trim_last_char {
267 Some(dch)
268 } else {
269 dch.get(..dch.len()-1)
270 })
271 }
272
273 #[inline]
274 fn count(self) -> usize {
275 self.chars.count() + if self.next_char.is_some() {1} else {0}
276 }
277
278 #[inline]
279 fn size_hint(&self) -> (usize, Option<usize>) {
280 (self.chars.size_hint().0 + if self.next_char.is_some() {1} else {0}, None)
281 }
282}
283
284#[cfg_attr(feature = "alloc", doc = "```rust")]
287#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
288#[doc = "```"]
291impl core::fmt::Display for AsciiCharsIter<'_> {
292 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
293 self.clone().try_for_each(|ch| f.write_str(ch.unwrap_or("\u{FFFD}")))
294 }
295}
296
/// Checks the per-character output of the iterator, including trailing-space trimming
/// at the end of input and before a replacement that starts with a space.
#[test]
fn iter_test() {
    use alloc::vec::Vec;

    // Constructed directly: the last item loses its trailing space.
    let from_new = AsciiCharsIter::new("🄏中国").flatten().collect::<Vec<_>>();
    assert_eq!(&from_new, &["NonCommercial", "Zhong ", "Guo"]);

    // Via the trait: spaces are kept when followed by non-space replacements.
    let mixed = "中国x🅶".ascii_chars().flatten().collect::<Vec<_>>();
    assert_eq!(&mixed, &["Zhong ", "Guo ", "x", "G"]);

    // A literal space in the input makes the preceding item drop its own space.
    let spaced = "☃中 国".ascii_chars().flatten().collect::<Vec<_>>();
    assert_eq!(&spaced, &["snowman ", "Zhong", " ", "Guo"]);
}
307
/// Text buried under stacked combining marks converts back to the plain base letters
/// when the placeholder is the empty string.
#[test]
fn zalgo() {
    let converted = deunicode_with_tofu("h̵̡̢̛̻̬͔̦͓̥̞̳͇̭̣̪̰̞̲̩̭̤͚͖͓̰̭̝̬̖̭͇͇̰͇͓̠͑͆͐͛̏͒͆̊́̊̂̉̉̈́̿̆̾̌̀͒͌́͗͋͜͝͝͝ͅĕ̷̡̧̡̧̜̮͙̗͙͕͖̩͈͙̞̞̭͙̯͖̰͖̙̹͖͚̦̬̄̀̓̈́͗̆̓̽͛̀͛̄͂̉͒̓̐̃̑́͊̀͋͊͗́̈́͑͗̐̔̈͊͋̓͊̓́̏̍̍̓͘̕͝͝͠ͅl̶̠̮̺̦̩͓̣̪͚͌̊̈́̀̄̈́̉͗̀̏͋̆̈̈́̉̋̊̉̉̌̈́̚̕͠͠l̴̨̡͍͇̝̟̩̙̤̰̬̬͖͙̺̟̯͓̥̯͔̤̠̻̤̮̘̋͑̑̿͗͂̃̓̓̉͒̑͜͠ͅo̸̢̧̨̜͉̜͓͙̰̳̙̖̰͇̺͈̝̬̩̫͛̅̍͌̎̅̿̂̚̕͜ ̵̛̗͍̊̈͋̀̊͒̄̔̔͋͋̆͋̅̀͂͂̍́̀̈́̈́͂̂̂̆̅͗̄̈́̀̈́̅̒̈̋͊̍̈́͂̑̓̽̂̂̓̚̕̚̕̚͠͝w̷̨͍͖̗͔͖͎̩̠̜͖̞͍̘̤͕̮̥̭͛̆̎̋̄͒̓̈́͆̀̆̚ǫ̷̢̢̧̧̨̧̧̨̢̼̮̺̬͇͓̪̯͖̥͙̠͍̭̩̰͎̘̺̝̲̖̮̞̝̠̠͎̻̠͙̫͙̞̫̭͖̱͉̱̮̌͑̈̅̈́̊̓͌̇͌̏̾̆͗̉͊̐̈́̾́̔̆͐́͘͜͜͝ͅŗ̵̡̛̛̟̭͉̰̮̺̜̼̰̟̲͖͔͕̰͕͇̪̲̫̬͚̱̮͎̭̩̩̉̇̉̀̉͑̔͋͆͌͜͠ļ̴̢̨̢̛͙̳̮̠͔͇͈̟͇̦̯͖̖͚̺̤͈̻͔̤̤̪̫͔͕̻̟̥̤̩͚̟̳͔̘̤͈͍͍̯̻̙̺̪̄̈́́͊̋̊́̅͛̉̊̉̅̋̆̔͑̈́͋̑͂̍̌̓̾̆̕̕͝ͅḏ̶̡̨̢̡̛̙͕̘̜͚̺̬̭̜͖͎͚̹̖͈̖̤͎̙̫͎̜̩̰̬̪̣̎͛̓̏̃͊̈́̽̆̒̈́̎̄̍́͘̚̚͝͠͠ͅ!̶̨̨̨̛̛̟̳̼̘͎͔̜͎͚̖̮̰͕̞̦̩̗̫̠͔͕͎͎͎̦̬̫̩̰̲̈́͋̽̀̒͆̄̑̐̀̐̋͆̈́̊̽̊̅̊̀͆͆͑̈͋̌͆͑̂̊͑̚͝͝ͅͅͅ", "");
    assert_eq!(converted, "hello world!");
}