decancer/
lib.rs

1#![doc = include_str!("../README.md")]
2#![allow(clippy::upper_case_acronyms)]
3#![cfg_attr(docsrs, feature(doc_cfg))]
4#![forbid(unsafe_code, rustdoc::broken_intra_doc_links)]
5
6mod bidi;
7mod codepoints;
8#[cfg(feature = "leetspeak")]
9mod leetspeak;
10mod options;
11mod similar;
12mod string;
13#[cfg(test)]
14mod tests;
15mod translation;
16mod util;
17
18use bidi::{Class, Level, Paragraph};
19pub use options::Options;
20pub use similar::Matcher;
21pub use string::CuredString;
22pub use translation::Translation;
23
24use codepoints::{
25  CASE_SENSITIVE_CODEPOINTS_COUNT, CASE_SENSITIVE_CODEPOINTS_OFFSET, CODEPOINTS_COUNT,
26};
27
28use util::{error_enum, is_none};
29#[cfg(feature = "options")]
30use util::{is_alphanumeric, is_special_rtl};
31
// Error type returned by the bidi-aware curing path (`cure_reordered` and `cure`),
// declared through the crate-internal `error_enum!` helper imported from `util`.
//
// NOTE(review): the `///` lines on the variants are handed to `error_enum!` as
// `#[doc]` tokens, so the macro may consume them (for example as error messages).
// Confirm against the macro definition in `util` before rewording them or adding
// further doc lines inside the invocation.
error_enum! {
  /// An error enum for unicode bidi errors caused by malformed string inputs.
  #[repr(u8)]
  #[derive(Copy, Clone, Debug)]
  pub enum Error {
    /// Attempted to create a unicode bidi level that exceeds `MAX_EXPLICIT_DEPTH` (125).
    LevelExplicitOverflow,
    /// Attempted to create a unicode bidi level that exceeds `MAX_IMPLICIT_DEPTH` (126).
    LevelImplicitOverflow,
    /// Attempted to lower a unicode bidi level that is already zero.
    LevelModificationUnderflow,
    /// Attempted to raise a unicode bidi level that is already at `MAX_IMPLICIT_DEPTH` (126).
    LevelModificationOverflow,
    /// Got a malformed isolating run sequence structure.
    MalformedIsolatingRunSequence,
    /// Got a malformed bidi level override status stack.
    MalformedOverrideStatusStack,
  }
}
51
52fn cure_char_inner(code: u32, options: Options) -> Translation {
53  let code_lowercased = char::from_u32(code)
54    .and_then(|character| character.to_lowercase().next())
55    .unwrap() as _;
56
57  let is_case_sensitive = code != code_lowercased;
58
59  #[cfg(feature = "options")]
60  let retain_capitalization = options.is(0);
61
62  #[cfg(feature = "options")]
63  let ascii_only = options.is(23);
64
65  #[cfg(feature = "options")]
66  let alphanumeric_only = options.is(24);
67
68  #[cfg(feature = "options")]
69  let default_output = if is_case_sensitive && retain_capitalization {
70    code
71  } else {
72    code_lowercased
73  };
74
75  #[cfg(not(feature = "options"))]
76  let default_output = code_lowercased;
77
78  if default_output < 0x80 {
79    #[cfg(feature = "options")]
80    if alphanumeric_only && !is_alphanumeric(default_output) {
81      return Translation::None;
82    }
83
84    return Translation::character(default_output);
85  } else if is_case_sensitive {
86    #[cfg_attr(not(feature = "options"), allow(unused_mut))]
87    if let Some(mut translation) = options.translate(
88      code,
89      CASE_SENSITIVE_CODEPOINTS_OFFSET as _,
90      CASE_SENSITIVE_CODEPOINTS_COUNT as _,
91    ) {
92      #[cfg(feature = "options")]
93      if retain_capitalization {
94        translation.make_uppercase();
95      }
96
97      #[cfg(feature = "options")]
98      return translation.ensure_stripped_if(ascii_only, alphanumeric_only);
99
100      #[cfg(not(feature = "options"))]
101      return translation;
102    }
103  }
104
105  #[cfg(feature = "options")]
106  match options.translate(code_lowercased, 6, CODEPOINTS_COUNT as _) {
107    Some(translation) => translation.ensure_stripped_if(ascii_only, alphanumeric_only),
108    None => {
109      if ascii_only || alphanumeric_only {
110        Translation::None
111      } else {
112        Translation::character(default_output)
113      }
114    },
115  }
116
117  #[cfg(not(feature = "options"))]
118  options
119    .translate(code_lowercased, 6, CODEPOINTS_COUNT as _)
120    .unwrap_or_else(|| Translation::character(default_output))
121}
122
123/// Cures a single character/unicode codepoint with the specified [`Options`].
124///
125/// To use this function with decancer's default options, use [the `cure_char` macro][cure_char!] instead.
126pub fn cure_char<C: Into<u32>>(code: C, options: Options) -> Translation {
127  let code = code.into();
128
129  if is_none(code) {
130    Translation::None
131  } else {
132    match Class::new(code) {
133      Some(Class::WS) => Translation::character(if code > 0x7f { 0x20 } else { code }),
134      None => Translation::None,
135      _ => cure_char_inner(code, options),
136    }
137  }
138}
139
/// Cures a single character/unicode codepoint with decancer's default options.
///
/// Output will always be in lowercase.
///
/// If you plan on only using this macro, it's recommended to disable the default `options` feature flag to optimize away unnecessary option checks.
///
/// This macro expands to:
///
/// ```rust,ignore
/// decancer::cure_char(code, decancer::Options::default());
/// ```
///
/// A trailing comma after the argument is accepted.
///
/// For more information, see [the `cure_char` function][cure_char()].
#[macro_export]
macro_rules! cure_char {
  ($code:expr $(,)?) => {
    $crate::cure_char($code, $crate::Options::default())
  };
}
159
160fn first_cure_pass(input: &str) -> (String, Vec<Class>, Vec<Paragraph>) {
161  let mut refined_input = String::with_capacity(input.len());
162  let mut original_classes = Vec::with_capacity(input.len());
163  let mut isolate_stack = Vec::new();
164
165  let mut paragraphs = Vec::new();
166  let mut paragraph_start = 0;
167  let mut paragraph_level: Option<Level> = None;
168  let mut pure_ltr = true;
169  let mut has_isolate_controls = false;
170
171  let mut idx = 0;
172
173  for codepoint in input.chars() {
174    let mut character_len = codepoint.len_utf8();
175    let mut codepoint = codepoint as u32;
176
177    if !is_none(codepoint) {
178      if let Some(class) = Class::new(codepoint) {
179        if class == Class::WS && codepoint > 0x7f {
180          character_len = 1;
181          codepoint = 0x20;
182        }
183
184        original_classes.resize(original_classes.len() + character_len, class);
185
186        match class {
187          Class::B => {
188            let paragraph_end = idx + character_len;
189
190            paragraphs.push(Paragraph {
191              range: paragraph_start..paragraph_end,
192              level: paragraph_level.unwrap_or(Level::ltr()),
193              pure_ltr,
194              has_isolate_controls,
195            });
196
197            paragraph_start = paragraph_end;
198            pure_ltr = true;
199            has_isolate_controls = false;
200            isolate_stack.clear();
201            paragraph_level = None;
202          },
203
204          Class::L | Class::R | Class::AL => {
205            if class != Class::L {
206              pure_ltr = false;
207            }
208
209            match isolate_stack.last() {
210              Some(&start_idx) => {
211                if original_classes[start_idx] == Class::FSI {
212                  let new_class = if class == Class::L {
213                    Class::LRI
214                  } else {
215                    Class::RLI
216                  };
217
218                  for j in 0..3 {
219                    original_classes[start_idx + j] = new_class;
220                  }
221                }
222              },
223
224              None => {
225                if paragraph_level.is_none() {
226                  paragraph_level.replace(if class == Class::L {
227                    Level::ltr()
228                  } else {
229                    Level::rtl()
230                  });
231                }
232              },
233            }
234          },
235
236          Class::AN | Class::LRE | Class::RLE | Class::LRO | Class::RLO => {
237            pure_ltr = false;
238          },
239
240          Class::RLI | Class::LRI | Class::FSI => {
241            pure_ltr = false;
242            has_isolate_controls = true;
243            isolate_stack.push(idx);
244          },
245
246          Class::PDI => {
247            isolate_stack.pop();
248          },
249
250          _ => {},
251        }
252
253        refined_input.push(char::from_u32(codepoint).unwrap());
254
255        idx += character_len;
256      }
257    }
258  }
259
260  if paragraph_start < idx {
261    paragraphs.push(Paragraph {
262      range: paragraph_start..idx,
263      level: paragraph_level.unwrap_or(Level::ltr()),
264      pure_ltr,
265      has_isolate_controls,
266    });
267  }
268
269  (refined_input, original_classes, paragraphs)
270}
271
272pub(crate) fn cure_reordered(input: &str, options: Options) -> Result<String, Error> {
273  let (refined_input, original_classes, paragraphs) = first_cure_pass(input);
274
275  let mut levels = Vec::with_capacity(refined_input.len());
276  let mut level_runs = Vec::new();
277  let mut processing_classes = original_classes.clone();
278  let mut output = String::with_capacity(refined_input.len());
279  let mut sequences = Vec::new();
280
281  for paragraph in &paragraphs {
282    levels.resize(levels.len() + paragraph.range.len(), paragraph.level);
283
284    if paragraph.level.0 != 0 || !paragraph.pure_ltr {
285      let input = paragraph.sliced(&refined_input);
286      let original_classes = paragraph.sliced(&original_classes);
287      let processing_classes = paragraph.sliced_mut(&mut processing_classes);
288      let levels = paragraph.sliced_mut(&mut levels);
289      level_runs.clear();
290
291      paragraph.compute_explicit(
292        input,
293        original_classes,
294        processing_classes,
295        levels,
296        &mut level_runs,
297      )?;
298
299      sequences.clear();
300      paragraph.isolating_run_sequences(levels, &level_runs, original_classes, &mut sequences)?;
301
302      for sequence in &sequences {
303        sequence.resolve_implicit_weak(input, processing_classes);
304        sequence.resolve_implicit_neutral(input, processing_classes, levels);
305      }
306
307      for j in 0..levels.len() {
308        {
309          let level = &mut levels[j];
310
311          match (level.is_rtl(), processing_classes[j]) {
312            (false, Class::AN | Class::EN) => level.raise(2)?,
313            (false, Class::R) | (true, Class::L | Class::EN | Class::AN) => {
314              level.raise(1)?;
315            },
316            _ => {},
317          }
318        }
319
320        if original_classes[j].removed_by_x9() {
321          levels[j] = if j > 0 {
322            levels[j - 1]
323          } else {
324            paragraph.level
325          };
326        }
327      }
328    }
329  }
330
331  for paragraph in paragraphs {
332    let (revised_levels, runs) =
333      paragraph.visual_runs(&refined_input, &original_classes, &levels)?;
334
335    for run in runs {
336      let text = &refined_input[run.clone()];
337
338      if revised_levels[run.start].is_rtl() {
339        for c in text.chars().rev() {
340          output += cure_char_inner(c as _, options);
341        }
342      } else {
343        for c in text.chars() {
344          output += cure_char_inner(c as _, options);
345        }
346      }
347    }
348  }
349
350  Ok(output)
351}
352
353/// Cures a string with the specified [`Options`].
354///
355/// To use this function with decancer's default options, use [the `cure` macro][cure!] instead.
356///
357/// # Errors
358///
359/// Errors if the string is malformed to the point where it's not possible to apply unicode's [bidirectional algorithm](https://en.wikipedia.org/wiki/Bidirectional_text) to it. This error is possible if [`Options::disable_bidi`] is disabled.
360pub fn cure(input: &str, options: Options) -> Result<CuredString, Error> {
361  Ok(CuredString({
362    #[cfg(feature = "options")]
363    if options.is(1) {
364      input
365        .chars()
366        .filter(|&character| !is_special_rtl(character as _))
367        .fold(
368          String::with_capacity(input.len()),
369          |mut output, character| {
370            output += cure_char(character, options);
371            output
372          },
373        )
374    } else {
375      cure_reordered(input, options)?
376    }
377
378    #[cfg(not(feature = "options"))]
379    cure_reordered(input, options)?
380  }))
381}
382
/// Cures a string with decancer's default options.
///
/// Output will always be in lowercase and [bidirectionally reordered](https://en.wikipedia.org/wiki/Bidirectional_text) in order to treat right-to-left characters. Therefore, the string output is laid out in memory the same way as it would be displayed graphically, but **may break if displayed graphically** since some right-to-left characters are reversed.
///
/// If you plan on only using this macro, it's recommended to disable the default `options` feature flag to optimize away unnecessary option checks.
///
/// This macro expands to:
///
/// ```rust,ignore
/// decancer::cure(string, decancer::Options::default());
/// ```
///
/// A trailing comma after the argument is accepted.
///
/// For more information, see [the `cure` function][cure()].
///
/// # Errors
///
/// Errors if the string is malformed to the point where it's not possible to apply unicode's [bidirectional algorithm](https://en.wikipedia.org/wiki/Bidirectional_text) to it.
///
/// # Examples
///
/// Basic usage:
///
/// ```rust
/// let cured = decancer::cure!("vＥⓡ𝔂 𝔽𝕌Ňℕｙ ţ乇𝕏𝓣").unwrap();
///
/// assert_eq!(cured, "very funny text");
/// assert!(cured.contains("FuNny"));
/// ```
#[macro_export]
macro_rules! cure {
  ($string:expr $(,)?) => {
    $crate::cure($string, $crate::Options::default())
  };
}