1#![doc = include_str!("../README.md")]
2#![allow(clippy::upper_case_acronyms)]
3#![cfg_attr(docsrs, feature(doc_cfg))]
4#![forbid(unsafe_code, rustdoc::broken_intra_doc_links)]
5
6mod bidi;
7mod codepoints;
8#[cfg(feature = "leetspeak")]
9mod leetspeak;
10mod options;
11mod similar;
12mod string;
13#[cfg(test)]
14mod tests;
15mod translation;
16mod util;
17
18use bidi::{Class, Level, Paragraph};
19pub use options::Options;
20pub use similar::Matcher;
21pub use string::CuredString;
22pub use translation::Translation;
23
24use codepoints::{
25 CASE_SENSITIVE_CODEPOINTS_COUNT, CASE_SENSITIVE_CODEPOINTS_OFFSET, CODEPOINTS_COUNT,
26};
27
28use util::{error_enum, is_none};
29#[cfg(feature = "options")]
30use util::{is_alphanumeric, is_special_rtl};
31
// Declares the crate's public `Error` enum through the `error_enum!` helper macro
// imported from `util`; the whole enum definition below is that macro's input.
//
// `cure_reordered` and `cure` return this type in their `Result`s; the individual
// variants are produced elsewhere in the crate.
// NOTE(review): judging by their names the variants cover bidi level arithmetic
// going out of range and malformed internal bidi structures -- confirm in `bidi`.
//
// NOTE(review): plain `//` comments are used on purpose. `///` doc comments turn
// into `#[doc = "..."]` attributes and would become part of the tokens handed to
// `error_enum!`; check the macro definition before converting these.
error_enum! {
    // Requests a one-byte (`u8`) discriminant; presumably forwarded by the macro
    // to the generated enum.
    #[repr(u8)]
    #[derive(Copy, Clone, Debug)]
    pub enum Error {
        LevelExplicitOverflow,
        LevelImplicitOverflow,
        LevelModificationUnderflow,
        LevelModificationOverflow,
        MalformedIsolatingRunSequence,
        MalformedOverrideStatusStack,
    }
}
51
52fn cure_char_inner(code: u32, options: Options) -> Translation {
53 let code_lowercased = char::from_u32(code)
54 .and_then(|character| character.to_lowercase().next())
55 .unwrap() as _;
56
57 let is_case_sensitive = code != code_lowercased;
58
59 #[cfg(feature = "options")]
60 let retain_capitalization = options.is(0);
61
62 #[cfg(feature = "options")]
63 let ascii_only = options.is(23);
64
65 #[cfg(feature = "options")]
66 let alphanumeric_only = options.is(24);
67
68 #[cfg(feature = "options")]
69 let default_output = if is_case_sensitive && retain_capitalization {
70 code
71 } else {
72 code_lowercased
73 };
74
75 #[cfg(not(feature = "options"))]
76 let default_output = code_lowercased;
77
78 if default_output < 0x80 {
79 #[cfg(feature = "options")]
80 if alphanumeric_only && !is_alphanumeric(default_output) {
81 return Translation::None;
82 }
83
84 return Translation::character(default_output);
85 } else if is_case_sensitive {
86 #[cfg_attr(not(feature = "options"), allow(unused_mut))]
87 if let Some(mut translation) = options.translate(
88 code,
89 CASE_SENSITIVE_CODEPOINTS_OFFSET as _,
90 CASE_SENSITIVE_CODEPOINTS_COUNT as _,
91 ) {
92 #[cfg(feature = "options")]
93 if retain_capitalization {
94 translation.make_uppercase();
95 }
96
97 #[cfg(feature = "options")]
98 return translation.ensure_stripped_if(ascii_only, alphanumeric_only);
99
100 #[cfg(not(feature = "options"))]
101 return translation;
102 }
103 }
104
105 #[cfg(feature = "options")]
106 match options.translate(code_lowercased, 6, CODEPOINTS_COUNT as _) {
107 Some(translation) => translation.ensure_stripped_if(ascii_only, alphanumeric_only),
108 None => {
109 if ascii_only || alphanumeric_only {
110 Translation::None
111 } else {
112 Translation::character(default_output)
113 }
114 },
115 }
116
117 #[cfg(not(feature = "options"))]
118 options
119 .translate(code_lowercased, 6, CODEPOINTS_COUNT as _)
120 .unwrap_or_else(|| Translation::character(default_output))
121}
122
123pub fn cure_char<C: Into<u32>>(code: C, options: Options) -> Translation {
127 let code = code.into();
128
129 if is_none(code) {
130 Translation::None
131 } else {
132 match Class::new(code) {
133 Some(Class::WS) => Translation::character(if code > 0x7f { 0x20 } else { code }),
134 None => Translation::None,
135 _ => cure_char_inner(code, options),
136 }
137 }
138}
139
/// Cures a single character with the default options.
///
/// Expands to a call to this crate's `cure_char` function, passing the given
/// expression as the codepoint and `Options::default()` as the options.
#[macro_export]
macro_rules! cure_char {
    ($character:expr) => {
        $crate::cure_char($character, $crate::Options::default())
    };
}
159
160fn first_cure_pass(input: &str) -> (String, Vec<Class>, Vec<Paragraph>) {
161 let mut refined_input = String::with_capacity(input.len());
162 let mut original_classes = Vec::with_capacity(input.len());
163 let mut isolate_stack = Vec::new();
164
165 let mut paragraphs = Vec::new();
166 let mut paragraph_start = 0;
167 let mut paragraph_level: Option<Level> = None;
168 let mut pure_ltr = true;
169 let mut has_isolate_controls = false;
170
171 let mut idx = 0;
172
173 for codepoint in input.chars() {
174 let mut character_len = codepoint.len_utf8();
175 let mut codepoint = codepoint as u32;
176
177 if !is_none(codepoint) {
178 if let Some(class) = Class::new(codepoint) {
179 if class == Class::WS && codepoint > 0x7f {
180 character_len = 1;
181 codepoint = 0x20;
182 }
183
184 original_classes.resize(original_classes.len() + character_len, class);
185
186 match class {
187 Class::B => {
188 let paragraph_end = idx + character_len;
189
190 paragraphs.push(Paragraph {
191 range: paragraph_start..paragraph_end,
192 level: paragraph_level.unwrap_or(Level::ltr()),
193 pure_ltr,
194 has_isolate_controls,
195 });
196
197 paragraph_start = paragraph_end;
198 pure_ltr = true;
199 has_isolate_controls = false;
200 isolate_stack.clear();
201 paragraph_level = None;
202 },
203
204 Class::L | Class::R | Class::AL => {
205 if class != Class::L {
206 pure_ltr = false;
207 }
208
209 match isolate_stack.last() {
210 Some(&start_idx) => {
211 if original_classes[start_idx] == Class::FSI {
212 let new_class = if class == Class::L {
213 Class::LRI
214 } else {
215 Class::RLI
216 };
217
218 for j in 0..3 {
219 original_classes[start_idx + j] = new_class;
220 }
221 }
222 },
223
224 None => {
225 if paragraph_level.is_none() {
226 paragraph_level.replace(if class == Class::L {
227 Level::ltr()
228 } else {
229 Level::rtl()
230 });
231 }
232 },
233 }
234 },
235
236 Class::AN | Class::LRE | Class::RLE | Class::LRO | Class::RLO => {
237 pure_ltr = false;
238 },
239
240 Class::RLI | Class::LRI | Class::FSI => {
241 pure_ltr = false;
242 has_isolate_controls = true;
243 isolate_stack.push(idx);
244 },
245
246 Class::PDI => {
247 isolate_stack.pop();
248 },
249
250 _ => {},
251 }
252
253 refined_input.push(char::from_u32(codepoint).unwrap());
254
255 idx += character_len;
256 }
257 }
258 }
259
260 if paragraph_start < idx {
261 paragraphs.push(Paragraph {
262 range: paragraph_start..idx,
263 level: paragraph_level.unwrap_or(Level::ltr()),
264 pure_ltr,
265 has_isolate_controls,
266 });
267 }
268
269 (refined_input, original_classes, paragraphs)
270}
271
272pub(crate) fn cure_reordered(input: &str, options: Options) -> Result<String, Error> {
273 let (refined_input, original_classes, paragraphs) = first_cure_pass(input);
274
275 let mut levels = Vec::with_capacity(refined_input.len());
276 let mut level_runs = Vec::new();
277 let mut processing_classes = original_classes.clone();
278 let mut output = String::with_capacity(refined_input.len());
279 let mut sequences = Vec::new();
280
281 for paragraph in ¶graphs {
282 levels.resize(levels.len() + paragraph.range.len(), paragraph.level);
283
284 if paragraph.level.0 != 0 || !paragraph.pure_ltr {
285 let input = paragraph.sliced(&refined_input);
286 let original_classes = paragraph.sliced(&original_classes);
287 let processing_classes = paragraph.sliced_mut(&mut processing_classes);
288 let levels = paragraph.sliced_mut(&mut levels);
289 level_runs.clear();
290
291 paragraph.compute_explicit(
292 input,
293 original_classes,
294 processing_classes,
295 levels,
296 &mut level_runs,
297 )?;
298
299 sequences.clear();
300 paragraph.isolating_run_sequences(levels, &level_runs, original_classes, &mut sequences)?;
301
302 for sequence in &sequences {
303 sequence.resolve_implicit_weak(input, processing_classes);
304 sequence.resolve_implicit_neutral(input, processing_classes, levels);
305 }
306
307 for j in 0..levels.len() {
308 {
309 let level = &mut levels[j];
310
311 match (level.is_rtl(), processing_classes[j]) {
312 (false, Class::AN | Class::EN) => level.raise(2)?,
313 (false, Class::R) | (true, Class::L | Class::EN | Class::AN) => {
314 level.raise(1)?;
315 },
316 _ => {},
317 }
318 }
319
320 if original_classes[j].removed_by_x9() {
321 levels[j] = if j > 0 {
322 levels[j - 1]
323 } else {
324 paragraph.level
325 };
326 }
327 }
328 }
329 }
330
331 for paragraph in paragraphs {
332 let (revised_levels, runs) =
333 paragraph.visual_runs(&refined_input, &original_classes, &levels)?;
334
335 for run in runs {
336 let text = &refined_input[run.clone()];
337
338 if revised_levels[run.start].is_rtl() {
339 for c in text.chars().rev() {
340 output += cure_char_inner(c as _, options);
341 }
342 } else {
343 for c in text.chars() {
344 output += cure_char_inner(c as _, options);
345 }
346 }
347 }
348 }
349
350 Ok(output)
351}
352
353pub fn cure(input: &str, options: Options) -> Result<CuredString, Error> {
361 Ok(CuredString({
362 #[cfg(feature = "options")]
363 if options.is(1) {
364 input
365 .chars()
366 .filter(|&character| !is_special_rtl(character as _))
367 .fold(
368 String::with_capacity(input.len()),
369 |mut output, character| {
370 output += cure_char(character, options);
371 output
372 },
373 )
374 } else {
375 cure_reordered(input, options)?
376 }
377
378 #[cfg(not(feature = "options"))]
379 cure_reordered(input, options)?
380 }))
381}
382
/// Cures a string with the default options.
///
/// Expands to a call to this crate's `cure` function, passing the given expression
/// as the input and `Options::default()` as the options, so it evaluates to the
/// same `Result` that `cure` returns.
#[macro_export]
macro_rules! cure {
    ($input:expr) => {
        $crate::cure($input, $crate::Options::default())
    };
}