1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#![doc = include_str!("../README.md")]
#![cfg_attr(not(feature = "std"), no_std)]
#![cfg_attr(docsrs, feature(doc_cfg))]

mod codepoints;
mod similar;
#[cfg(feature = "std")]
mod string;
#[cfg(test)]
mod test;
mod translation;
mod util;

#[cfg(feature = "std")]
pub use string::CuredString;
pub use translation::Translation;

use codepoints::{
  Codepoint, CASE_SENSITIVE_CODEPOINTS_COUNT, CASE_SENSITIVE_CODEPOINTS_OFFSET, CODEPOINTS_COUNT,
};
use core::cmp::Ordering;

// Binary-searches the codepoint entries for `code`.
//
// Entry `i` is read through `Codepoint::at(offset + i * 5)`, so entries are
// presumably five units wide in the underlying table (TODO confirm against
// `codepoints`). `end` is the highest entry index to consider (inclusive).
//
// Returns the matching entry's translation, or `None` once the search range
// becomes empty.
const fn translate(code: u32, offset: i32, end: i32) -> Option<Translation> {
  let mut low = 0;
  let mut high = end;

  loop {
    // an empty range means no entry matched `code`.
    if low > high {
      return None;
    }

    let middle = (low + high) / 2;
    let candidate = Codepoint::at(offset + (middle * 5));

    match candidate.matches(code) {
      Ordering::Equal => return Some(candidate.translation(code)),
      // keep searching in the upper half of the range.
      Ordering::Greater => low = middle + 1,
      // keep searching in the lower half of the range.
      Ordering::Less => high = middle - 1,
    }
  }
}

/// Cures a single character/unicode codepoint.
///
/// Accepts anything convertible into a [`u32`], such as a [`char`] or a raw codepoint value.
///
/// # Examples
///
/// In most cases, the result is a single unicode character:
///
/// ```rust
/// use decancer::Translation;
///
/// assert!(matches!(decancer::cure_char('E'), Translation::Character('e')));
/// ```
///
/// Several special cases yield an [ASCII](https://en.wikipedia.org/wiki/ASCII) [`&'static str`][prim@str] instead:
///
/// ```rust
/// use decancer::Translation;
///
/// assert!(matches!(decancer::cure_char('æ'), Translation::String("ae")));
/// assert!(matches!(decancer::cure_char('ij'), Translation::String("ij")));
/// assert!(matches!(decancer::cure_char('œ'), Translation::String("oe")));
/// assert!(matches!(decancer::cure_char('🆐'), Translation::String("dj")));
/// assert!(matches!(decancer::cure_char('🆑'), Translation::String("cl")));
/// assert!(matches!(decancer::cure_char('🆔'), Translation::String("id")));
/// assert!(matches!(decancer::cure_char('🆖'), Translation::String("ng")));
/// assert!(matches!(decancer::cure_char('🆗'), Translation::String("ok")));
/// assert!(matches!(decancer::cure_char('🆚'), Translation::String("vs")));
/// assert!(matches!(decancer::cure_char('🜀'), Translation::String("qe")));
/// assert!(matches!(decancer::cure_char('🜇'), Translation::String("ar")));
///
/// assert!(matches!(decancer::cure_char('⅓'), Translation::String("1/3")));
/// assert!(matches!(decancer::cure_char('⅔'), Translation::String("2/3")));
/// assert!(matches!(decancer::cure_char('⅕'), Translation::String("1/5")));
/// assert!(matches!(decancer::cure_char('⅖'), Translation::String("2/5")));
/// assert!(matches!(decancer::cure_char('⅗'), Translation::String("3/5")));
/// assert!(matches!(decancer::cure_char('⅘'), Translation::String("4/5")));
/// assert!(matches!(decancer::cure_char('㋍'), Translation::String("erg")));
/// assert!(matches!(decancer::cure_char('㋏'), Translation::String("ltd")));
///
/// assert!(matches!(decancer::cure_char('㍴'), Translation::String("bar")));
/// assert!(matches!(decancer::cure_char('㎈'), Translation::String("cal")));
/// assert!(matches!(decancer::cure_char('㎭'), Translation::String("rad")));
/// assert!(matches!(decancer::cure_char('㏇'), Translation::String("co.")));
/// assert!(matches!(decancer::cure_char('㏒'), Translation::String("log")));
/// assert!(matches!(decancer::cure_char('㏕'), Translation::String("mil")));
/// assert!(matches!(decancer::cure_char('㏖'), Translation::String("mol")));
/// assert!(matches!(decancer::cure_char('㏙'), Translation::String("ppm")));
/// ```
///
/// You would get [`None`][Translation::None] if your unicode character is a [control character](https://en.wikipedia.org/wiki/Control_character), [surrogate](https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates), [combining character](https://en.wikipedia.org/wiki/Script_(Unicode)#Special_script_property_values) (e.g. diacritics), [private use character](https://en.wikipedia.org/wiki/Private_Use_Areas), [byte order character](https://en.wikipedia.org/wiki/Byte_order_mark), or any invalid unicode value (e.g. beyond [`char::MAX`]):
///
/// ```rust
/// use decancer::Translation;
///
/// assert!(matches!(decancer::cure_char(0xD800u32), Translation::None));
/// assert!(matches!(decancer::cure_char(char::REPLACEMENT_CHARACTER), Translation::None));
/// assert!(matches!(decancer::cure_char((char::MAX as u32) + 1), Translation::None));
/// ```
pub fn cure_char<C: Into<u32>>(code: C) -> Translation {
  let code: u32 = code.into();

  // reject 0..=31, 127, 0xd800..=0xf8ff, and everything from 0xe0100 upwards right away.
  let rejected =
    code <= 31 || code == 127 || (0xd800..=0xf8ff).contains(&code) || code >= 0xe0100;

  if rejected {
    return Translation::None;
  }

  // SAFETY: the guard above has rejected every surrogate (0xd800..=0xdfff) and every value
  // from 0xe0100 upwards, so `code` is a valid unicode scalar value at this point.
  let original = unsafe { char::from_u32_unchecked(code) };

  // SAFETY: a codepoint without a lowercase mapping simply lowercases to itself,
  // therefore the first item of the iterator always exists.
  let lowercased = unsafe { original.to_lowercase().next().unwrap_unchecked() };
  let code_lowercased = lowercased as u32;

  // a lowercased codepoint below 0x80 is returned as-is.
  if code_lowercased < 0x80 {
    return Translation::character(code_lowercased);
  }

  // lowercasing changed the codepoint: try the case-sensitive entries with the original value first.
  if code != code_lowercased {
    let case_sensitive = translate(
      code,
      CASE_SENSITIVE_CODEPOINTS_OFFSET as i32,
      CASE_SENSITIVE_CODEPOINTS_COUNT as i32,
    );

    if let Some(translation) = case_sensitive {
      return translation;
    }
  }

  // otherwise look up the lowercased codepoint, falling back to the lowercased character itself.
  match translate(code_lowercased, 6, CODEPOINTS_COUNT as i32) {
    Some(translation) => translation,
    None => Translation::character(code_lowercased),
  }
}

/// Cures a string.
///
/// Every [`char`] of `input` is passed through [`cure_char`] and the results are collected into a [`CuredString`].
///
/// # Examples
///
/// Basic usage:
///
/// ```rust
/// let cured = decancer::cure("vEⓡ𝔂 𝔽𝕌Ňℕy ţ乇𝕏𝓣");
///
/// // cured here is a decancer::CuredString struct wrapping over the cured string
/// // for comparison purposes, it's more recommended to use the methods provided by the decancer::CuredString struct.
/// assert_eq!(cured, "very funny text");
/// assert!(cured.starts_with("very"));
/// assert!(cured.contains("funny"));
/// assert!(cured.ends_with("text"));
///
/// // retrieve the String inside and consume the struct.
/// let _output_str = cured.into_str();
/// ```
#[cfg(feature = "std")]
#[inline(always)]
pub fn cure(input: &str) -> CuredString {
  // lazily cure each character in order of appearance.
  let translations = input.chars().map(cure_char);

  translations.collect::<CuredString>()
}