1use crate::error::Result;
7use lazy_static::lazy_static;
8use regex::Regex;
9use std::collections::HashMap;
10
11lazy_static! {
12 static ref HTML_TAG_PATTERN: Regex = Regex::new(r"<[^>]+>").unwrap();
14
15 static ref URL_PATTERN: Regex = Regex::new(
17 r"(?i)https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*)"
18 ).unwrap();
19
20 static ref EMAIL_PATTERN: Regex = Regex::new(
22 r"(?i)[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
23 ).unwrap();
24
25 static ref PHONE_PATTERN: Regex = Regex::new(
27 r"(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})"
28 ).unwrap();
29
30 static ref CONTRACTIONS: HashMap<&'static str, &'static str> = {
32 let mut m = HashMap::new();
33 m.insert("can't", "cannot");
34 m.insert("won't", "will not");
35 m.insert("n't", " not");
36 m.insert("'re", " are");
37 m.insert("'ve", " have");
38 m.insert("'ll", " will");
39 m.insert("'d", " would");
40 m.insert("'m", " am");
41 m.insert("'s", " is");
42 m.insert("let's", "let us");
43 m.insert("it's", "it is");
44 m.insert("that's", "that is");
45 m.insert("what's", "what is");
46 m.insert("where's", "where is");
47 m.insert("who's", "who is");
48 m.insert("there's", "there is");
49 m.insert("here's", "here is");
50 m
51 };
52
53 static ref EMOJI_PATTERN: Regex = Regex::new(
55 concat!(
56 "[",
57 "\u{1F600}-\u{1F64F}", "\u{1F300}-\u{1F5FF}", "\u{1F680}-\u{1F6FF}", "\u{1F700}-\u{1F77F}", "\u{1F780}-\u{1F7FF}", "\u{1F800}-\u{1F8FF}", "\u{2600}-\u{26FF}", "\u{2700}-\u{27BF}", "]"
66 )
67 ).unwrap();
68
69 static ref NUMBER_PATTERN: Regex = Regex::new(
71 r"(?i)\b[-+]?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?(?:[eE][-+]?\d+)?\b"
72 ).unwrap();
73
74 static ref CURRENCY_PATTERN: Regex = Regex::new(
76 r"(?i)(?:[$€£¥₹])[ \t]*(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?|(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?[ \t]*(?:dollars?|euros?|pounds?|yen|rupees?|USD|EUR|GBP|JPY|INR)\b"
77 ).unwrap();
78
79 static ref PERCENTAGE_PATTERN: Regex = Regex::new(
81 r"(?i)[-+]?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?%"
82 ).unwrap();
83
84 static ref ORDINAL_PATTERN: Regex = Regex::new(
86 r"(?i)\b(\d+)(?:st|nd|rd|th)\b"
87 ).unwrap();
88
89 static ref DATE_PATTERN: Regex = Regex::new(
93 r"(?i)\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:19|20)?\d{2})|(?:(?:19|20)\d{2}[/-](?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01]))|(?:(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2},?\s+\d{4})|(?:\d{1,2}\s+(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{4})\b"
94 ).unwrap();
95
96 static ref TIME_PATTERN: Regex = Regex::new(
98 r"(?i)\b(?:[01]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9])?(?:\s*[aApP][mM])?\b"
99 ).unwrap();
100
101 static ref FRACTION_PATTERN: Regex = Regex::new(
103 r"\b\d+\s*/\s*\d+\b|\b\d+\s+\d+\s*/\s*\d+\b"
104 ).unwrap();
105
106 static ref ROMAN_NUMERAL_PATTERN: Regex = Regex::new(
108 r"(?i)\b[MDCLXVI]+\b"
109 ).unwrap();
110
111 static ref SCIENTIFIC_NOTATION_PATTERN: Regex = Regex::new(
113 r"(?i)[-+]?(?:\d+\.?\d*|\.\d+)[eE][-+]?\d+"
114 ).unwrap();
115
116 static ref TEMPERATURE_PATTERN: Regex = Regex::new(
118 r"(?i)[-+]?(?:\d+\.?\d*|\.\d+)\s*(?:°[CFK]|[CFK](?:\s+degrees?)?\s*(?:celsius|fahrenheit|kelvin)?|degrees?\s+(?:celsius|fahrenheit|kelvin))\b"
119 ).unwrap();
120
121 static ref MEASUREMENT_PATTERN: Regex = Regex::new(
123 r"(?i)[-+]?(?:\d+\.?\d*|\.\d+)\s*(?:mm|cm|m|km|in|ft|yd|mi|g|kg|lb|oz|ml|l|gal|mph|kph|Hz|kHz|MHz|GHz|KB|MB|GB|TB|°|rad|sq|cu)\b"
124 ).unwrap();
125
126 static ref ENHANCED_CURRENCY_PATTERN: Regex = Regex::new(
128 r"(?i)(?:[$€£¥₹₽₩¢₪₨₦₴₵₡₲₱₫₭₦₨]\s*\d+(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:,\d{3})*(?:\.\d{1,2})?\s*(?:USD|EUR|GBP|JPY|INR|RUB|KRW|CNY|CAD|AUD|CHF|SGD|dollars?|euros?|pounds?|yen|rupees?|yuan|won|rubles?))\b"
129 ).unwrap();
130
131 static ref VERSION_PATTERN: Regex = Regex::new(
133 r"\bv?\d+(?:\.\d+){1,3}(?:-[a-zA-Z]+\d*)?\b"
134 ).unwrap();
135
136 static ref IP_ADDRESS_PATTERN: Regex = Regex::new(
138 r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
139 ).unwrap();
140
141 static ref HEX_PATTERN: Regex = Regex::new(
143 r"(?i)\b0x[0-9a-f]+\b|#[0-9a-f]{3,8}\b"
144 ).unwrap();
145
146 static ref BINARY_PATTERN: Regex = Regex::new(
148 r"\b0b[01]+\b"
149 ).unwrap();
150}
151
/// Configurable text-cleaning pipeline.
///
/// Each `bool` switches one pass of [`AdvancedTextCleaner::clean`] on or off;
/// each `*_placeholder` is the text substituted for matches of the
/// corresponding pattern.
#[derive(Debug, Clone)]
pub struct AdvancedTextCleaner {
    // --- structural / privacy passes ---
    /// Replace HTML tags with a space.
    strip_html: bool,
    /// Replace URLs with `url_placeholder`.
    replace_urls: bool,
    /// Replace e-mail addresses with `email_placeholder`.
    replace_emails: bool,
    /// Replace phone numbers with `phone_placeholder`.
    replace_phone_numbers: bool,
    /// Expand contractions such as "can't".
    expand_contractions: bool,
    /// Replace emoji code points with a space.
    remove_emojis: bool,
    /// Apply Unicode NFC normalization.
    normalize_unicode: bool,
    /// When `false`, the output is lower-cased.
    preserve_case: bool,

    // --- numeric / token normalization passes ---
    normalize_numbers: bool,
    normalize_currencies: bool,
    normalize_percentages: bool,
    normalize_ordinals: bool,
    normalize_dates: bool,
    normalize_times: bool,
    normalize_fractions: bool,
    normalize_roman_numerals: bool,
    normalize_scientific_notation: bool,
    normalize_temperatures: bool,
    normalize_measurements: bool,
    normalize_versions: bool,
    normalize_ip_addresses: bool,
    normalize_hex_numbers: bool,
    normalize_binary_numbers: bool,

    // --- replacement text, one per pass ---
    url_placeholder: String,
    email_placeholder: String,
    phone_placeholder: String,
    number_placeholder: String,
    currency_placeholder: String,
    percentage_placeholder: String,
    ordinal_placeholder: String,
    date_placeholder: String,
    time_placeholder: String,
    fraction_placeholder: String,
    roman_placeholder: String,
    scientific_placeholder: String,
    temperature_placeholder: String,
    measurement_placeholder: String,
    version_placeholder: String,
    ip_placeholder: String,
    hex_placeholder: String,
    binary_placeholder: String,
}
197
198impl AdvancedTextCleaner {
199 pub fn new() -> Self {
201 Self {
202 strip_html: true,
203 replace_urls: true,
204 replace_emails: true,
205 replace_phone_numbers: false,
206 expand_contractions: true,
207 remove_emojis: false,
208 normalize_unicode: true,
209 preserve_case: false,
210 normalize_numbers: false,
211 normalize_currencies: false,
212 normalize_percentages: false,
213 normalize_ordinals: false,
214 normalize_dates: false,
215 normalize_times: false,
216 normalize_fractions: false,
217 normalize_roman_numerals: false,
218 normalize_scientific_notation: false,
219 normalize_temperatures: false,
220 normalize_measurements: false,
221 normalize_versions: false,
222 normalize_ip_addresses: false,
223 normalize_hex_numbers: false,
224 normalize_binary_numbers: false,
225 url_placeholder: "[URL]".to_string(),
226 email_placeholder: "[EMAIL]".to_string(),
227 phone_placeholder: "[PHONE]".to_string(),
228 number_placeholder: "[NUMBER]".to_string(),
229 currency_placeholder: "[CURRENCY]".to_string(),
230 percentage_placeholder: "[PERCENT]".to_string(),
231 ordinal_placeholder: "[ORDINAL]".to_string(),
232 date_placeholder: "[DATE]".to_string(),
233 time_placeholder: "[TIME]".to_string(),
234 fraction_placeholder: "[FRACTION]".to_string(),
235 roman_placeholder: "[ROMAN]".to_string(),
236 scientific_placeholder: "[SCIENTIFIC]".to_string(),
237 temperature_placeholder: "[TEMP]".to_string(),
238 measurement_placeholder: "[MEASURE]".to_string(),
239 version_placeholder: "[VERSION]".to_string(),
240 ip_placeholder: "[IP]".to_string(),
241 hex_placeholder: "[HEX]".to_string(),
242 binary_placeholder: "[BINARY]".to_string(),
243 }
244 }
245
246 pub fn privacy_focused() -> Self {
248 Self {
249 strip_html: true,
250 replace_urls: true,
251 replace_emails: true,
252 replace_phone_numbers: true,
253 expand_contractions: true,
254 remove_emojis: false,
255 normalize_unicode: true,
256 preserve_case: false,
257 normalize_numbers: false,
258 normalize_currencies: false,
259 normalize_percentages: false,
260 normalize_ordinals: false,
261 normalize_dates: true, normalize_times: true, normalize_fractions: false,
264 normalize_roman_numerals: false,
265 normalize_scientific_notation: false,
266 normalize_temperatures: false,
267 normalize_measurements: false,
268 normalize_versions: false,
269 normalize_ip_addresses: true, normalize_hex_numbers: false,
271 normalize_binary_numbers: false,
272 url_placeholder: "[URL]".to_string(),
273 email_placeholder: "[EMAIL]".to_string(),
274 phone_placeholder: "[PHONE]".to_string(),
275 number_placeholder: "[NUMBER]".to_string(),
276 currency_placeholder: "[CURRENCY]".to_string(),
277 percentage_placeholder: "[PERCENT]".to_string(),
278 ordinal_placeholder: "[ORDINAL]".to_string(),
279 date_placeholder: "[DATE]".to_string(),
280 time_placeholder: "[TIME]".to_string(),
281 fraction_placeholder: "[FRACTION]".to_string(),
282 roman_placeholder: "[ROMAN]".to_string(),
283 scientific_placeholder: "[SCIENTIFIC]".to_string(),
284 temperature_placeholder: "[TEMP]".to_string(),
285 measurement_placeholder: "[MEASURE]".to_string(),
286 version_placeholder: "[VERSION]".to_string(),
287 ip_placeholder: "[IP]".to_string(),
288 hex_placeholder: "[HEX]".to_string(),
289 binary_placeholder: "[BINARY]".to_string(),
290 }
291 }
292
293 pub fn minimal() -> Self {
295 Self {
296 strip_html: true,
297 replace_urls: false,
298 replace_emails: false,
299 replace_phone_numbers: false,
300 expand_contractions: false,
301 remove_emojis: false,
302 normalize_unicode: true,
303 preserve_case: true,
304 normalize_numbers: false,
305 normalize_currencies: false,
306 normalize_percentages: false,
307 normalize_ordinals: false,
308 normalize_dates: false,
309 normalize_times: false,
310 normalize_fractions: false,
311 normalize_roman_numerals: false,
312 normalize_scientific_notation: false,
313 normalize_temperatures: false,
314 normalize_measurements: false,
315 normalize_versions: false,
316 normalize_ip_addresses: false,
317 normalize_hex_numbers: false,
318 normalize_binary_numbers: false,
319 url_placeholder: "[URL]".to_string(),
320 email_placeholder: "[EMAIL]".to_string(),
321 phone_placeholder: "[PHONE]".to_string(),
322 number_placeholder: "[NUMBER]".to_string(),
323 currency_placeholder: "[CURRENCY]".to_string(),
324 percentage_placeholder: "[PERCENT]".to_string(),
325 ordinal_placeholder: "[ORDINAL]".to_string(),
326 date_placeholder: "[DATE]".to_string(),
327 time_placeholder: "[TIME]".to_string(),
328 fraction_placeholder: "[FRACTION]".to_string(),
329 roman_placeholder: "[ROMAN]".to_string(),
330 scientific_placeholder: "[SCIENTIFIC]".to_string(),
331 temperature_placeholder: "[TEMP]".to_string(),
332 measurement_placeholder: "[MEASURE]".to_string(),
333 version_placeholder: "[VERSION]".to_string(),
334 ip_placeholder: "[IP]".to_string(),
335 hex_placeholder: "[HEX]".to_string(),
336 binary_placeholder: "[BINARY]".to_string(),
337 }
338 }
339
340 pub fn set_strip_html(mut self, value: bool) -> Self {
342 self.strip_html = value;
343 self
344 }
345
346 pub fn set_replace_urls(mut self, value: bool) -> Self {
348 self.replace_urls = value;
349 self
350 }
351
352 pub fn set_replace_emails(mut self, value: bool) -> Self {
354 self.replace_emails = value;
355 self
356 }
357
358 pub fn set_replace_phone_numbers(mut self, value: bool) -> Self {
360 self.replace_phone_numbers = value;
361 self
362 }
363
364 pub fn set_expand_contractions(mut self, value: bool) -> Self {
366 self.expand_contractions = value;
367 self
368 }
369
370 pub fn set_remove_emojis(mut self, value: bool) -> Self {
372 self.remove_emojis = value;
373 self
374 }
375
376 pub fn set_normalize_numbers(mut self, value: bool) -> Self {
378 self.normalize_numbers = value;
379 self
380 }
381
382 pub fn set_normalize_currencies(mut self, value: bool) -> Self {
384 self.normalize_currencies = value;
385 self
386 }
387
388 pub fn set_normalize_percentages(mut self, value: bool) -> Self {
390 self.normalize_percentages = value;
391 self
392 }
393
394 pub fn set_normalize_ordinals(mut self, value: bool) -> Self {
396 self.normalize_ordinals = value;
397 self
398 }
399
400 pub fn set_normalize_dates(mut self, value: bool) -> Self {
402 self.normalize_dates = value;
403 self
404 }
405
406 pub fn set_normalize_times(mut self, value: bool) -> Self {
408 self.normalize_times = value;
409 self
410 }
411
412 pub fn set_normalize_fractions(mut self, value: bool) -> Self {
414 self.normalize_fractions = value;
415 self
416 }
417
418 pub fn set_normalize_roman_numerals(mut self, value: bool) -> Self {
420 self.normalize_roman_numerals = value;
421 self
422 }
423
424 pub fn set_normalize_scientific_notation(mut self, value: bool) -> Self {
426 self.normalize_scientific_notation = value;
427 self
428 }
429
430 pub fn set_normalize_temperatures(mut self, value: bool) -> Self {
432 self.normalize_temperatures = value;
433 self
434 }
435
436 pub fn set_normalize_measurements(mut self, value: bool) -> Self {
438 self.normalize_measurements = value;
439 self
440 }
441
442 pub fn set_normalize_versions(mut self, value: bool) -> Self {
444 self.normalize_versions = value;
445 self
446 }
447
448 pub fn set_normalize_ip_addresses(mut self, value: bool) -> Self {
450 self.normalize_ip_addresses = value;
451 self
452 }
453
454 pub fn set_normalize_hex_numbers(mut self, value: bool) -> Self {
456 self.normalize_hex_numbers = value;
457 self
458 }
459
460 pub fn set_normalize_binary_numbers(mut self, value: bool) -> Self {
462 self.normalize_binary_numbers = value;
463 self
464 }
465
466 pub fn set_placeholders(
468 mut self,
469 url: Option<String>,
470 email: Option<String>,
471 phone: Option<String>,
472 ) -> Self {
473 if let Some(u) = url {
474 self.url_placeholder = u;
475 }
476 if let Some(e) = email {
477 self.email_placeholder = e;
478 }
479 if let Some(p) = phone {
480 self.phone_placeholder = p;
481 }
482 self
483 }
484
485 pub fn set_number_placeholders(
487 mut self,
488 number: Option<String>,
489 currency: Option<String>,
490 percentage: Option<String>,
491 ordinal: Option<String>,
492 ) -> Self {
493 if let Some(n) = number {
494 self.number_placeholder = n;
495 }
496 if let Some(c) = currency {
497 self.currency_placeholder = c;
498 }
499 if let Some(p) = percentage {
500 self.percentage_placeholder = p;
501 }
502 if let Some(o) = ordinal {
503 self.ordinal_placeholder = o;
504 }
505 self
506 }
507
508 pub fn set_advanced_placeholders(
510 mut self,
511 date: Option<String>,
512 time: Option<String>,
513 fraction: Option<String>,
514 roman: Option<String>,
515 scientific: Option<String>,
516 temperature: Option<String>,
517 measurement: Option<String>,
518 version: Option<String>,
519 ip: Option<String>,
520 hex: Option<String>,
521 binary: Option<String>,
522 ) -> Self {
523 if let Some(d) = date {
524 self.date_placeholder = d;
525 }
526 if let Some(t) = time {
527 self.time_placeholder = t;
528 }
529 if let Some(f) = fraction {
530 self.fraction_placeholder = f;
531 }
532 if let Some(r) = roman {
533 self.roman_placeholder = r;
534 }
535 if let Some(s) = scientific {
536 self.scientific_placeholder = s;
537 }
538 if let Some(temp) = temperature {
539 self.temperature_placeholder = temp;
540 }
541 if let Some(m) = measurement {
542 self.measurement_placeholder = m;
543 }
544 if let Some(v) = version {
545 self.version_placeholder = v;
546 }
547 if let Some(i) = ip {
548 self.ip_placeholder = i;
549 }
550 if let Some(h) = hex {
551 self.hex_placeholder = h;
552 }
553 if let Some(b) = binary {
554 self.binary_placeholder = b;
555 }
556 self
557 }
558
559 pub fn clean(&self, text: &str) -> Result<String> {
561 let mut cleaned = text.to_string();
562
563 if self.strip_html {
565 cleaned = strip_html_tags(&cleaned);
566 }
567
568 if self.replace_urls {
570 cleaned = URL_PATTERN
571 .replace_all(&cleaned, &self.url_placeholder)
572 .to_string();
573 }
574
575 if self.replace_emails {
577 cleaned = EMAIL_PATTERN
578 .replace_all(&cleaned, &self.email_placeholder)
579 .to_string();
580 }
581
582 if self.replace_phone_numbers {
584 cleaned = PHONE_PATTERN
585 .replace_all(&cleaned, &self.phone_placeholder)
586 .to_string();
587 }
588
589 if self.expand_contractions {
591 cleaned = expand_contractions(&cleaned);
592 }
593
594 if self.remove_emojis {
596 cleaned = EMOJI_PATTERN.replace_all(&cleaned, " ").to_string();
597 }
598
599 if self.normalize_dates {
603 cleaned = DATE_PATTERN
604 .replace_all(&cleaned, &self.date_placeholder)
605 .to_string();
606 }
607
608 if self.normalize_times {
610 cleaned = TIME_PATTERN
611 .replace_all(&cleaned, &self.time_placeholder)
612 .to_string();
613 }
614
615 if self.normalize_ip_addresses {
617 cleaned = IP_ADDRESS_PATTERN
618 .replace_all(&cleaned, &self.ip_placeholder)
619 .to_string();
620 }
621
622 if self.normalize_versions {
624 cleaned = VERSION_PATTERN
625 .replace_all(&cleaned, &self.version_placeholder)
626 .to_string();
627 }
628
629 if self.normalize_scientific_notation {
631 cleaned = SCIENTIFIC_NOTATION_PATTERN
632 .replace_all(&cleaned, &self.scientific_placeholder)
633 .to_string();
634 }
635
636 if self.normalize_temperatures {
638 cleaned = TEMPERATURE_PATTERN
639 .replace_all(&cleaned, &self.temperature_placeholder)
640 .to_string();
641 }
642
643 if self.normalize_measurements {
645 cleaned = MEASUREMENT_PATTERN
646 .replace_all(&cleaned, &self.measurement_placeholder)
647 .to_string();
648 }
649
650 if self.normalize_hex_numbers {
652 cleaned = HEX_PATTERN
653 .replace_all(&cleaned, &self.hex_placeholder)
654 .to_string();
655 }
656
657 if self.normalize_binary_numbers {
659 cleaned = BINARY_PATTERN
660 .replace_all(&cleaned, &self.binary_placeholder)
661 .to_string();
662 }
663
664 if self.normalize_currencies {
666 cleaned = ENHANCED_CURRENCY_PATTERN
667 .replace_all(&cleaned, &self.currency_placeholder)
668 .to_string();
669 }
670
671 if self.normalize_fractions {
673 cleaned = FRACTION_PATTERN
674 .replace_all(&cleaned, &self.fraction_placeholder)
675 .to_string();
676 }
677
678 if self.normalize_roman_numerals {
680 cleaned = ROMAN_NUMERAL_PATTERN
681 .replace_all(&cleaned, &self.roman_placeholder)
682 .to_string();
683 }
684
685 if self.normalize_percentages {
687 cleaned = PERCENTAGE_PATTERN
688 .replace_all(&cleaned, &self.percentage_placeholder)
689 .to_string();
690 }
691
692 if self.normalize_ordinals {
694 cleaned = ORDINAL_PATTERN
695 .replace_all(&cleaned, &self.ordinal_placeholder)
696 .to_string();
697 }
698
699 if self.normalize_numbers {
701 cleaned = NUMBER_PATTERN
702 .replace_all(&cleaned, &self.number_placeholder)
703 .to_string();
704 }
705
706 if self.normalize_unicode {
708 cleaned = normalize_unicode(&cleaned)?;
709 }
710
711 if !self.preserve_case {
713 cleaned = cleaned.to_lowercase();
714 }
715
716 cleaned = normalize_whitespace(&cleaned);
718
719 Ok(cleaned)
720 }
721}
722
723impl Default for AdvancedTextCleaner {
724 fn default() -> Self {
725 Self::new()
726 }
727}
728
729#[allow(dead_code)]
731pub fn strip_html_tags(text: &str) -> String {
732 HTML_TAG_PATTERN.replace_all(text, " ").to_string()
733}
734
735#[allow(dead_code)]
737pub fn replace_urls(text: &str, placeholder: &str) -> String {
738 URL_PATTERN.replace_all(text, placeholder).to_string()
739}
740
741#[allow(dead_code)]
743pub fn replace_emails(text: &str, placeholder: &str) -> String {
744 EMAIL_PATTERN.replace_all(text, placeholder).to_string()
745}
746
747#[allow(dead_code)]
749pub fn replace_phone_numbers(text: &str, placeholder: &str) -> String {
750 PHONE_PATTERN.replace_all(text, placeholder).to_string()
751}
752
753#[allow(dead_code)]
755pub fn expand_contractions(text: &str) -> String {
756 let mut result = text.to_string();
757
758 let mut contractions: Vec<_> = CONTRACTIONS.iter().collect();
760 contractions.sort_by_key(|(k_, _)| std::cmp::Reverse(k_.len()));
761
762 for (contraction, expansion) in contractions {
763 let escaped = regex::escape(contraction);
764 let pattern = format!(r"\b{escaped}\b");
765 if let Ok(re) = Regex::new(&pattern) {
766 result = re.replace_all(&result, *expansion).to_string();
767 }
768 }
769
770 result
771}
772
773#[allow(dead_code)]
775pub fn normalize_unicode(text: &str) -> Result<String> {
776 use unicode_normalization::UnicodeNormalization;
777 Ok(text.nfc().collect())
778}
779
#[allow(dead_code)]
/// Trims `text` and collapses every run of whitespace into a single space.
///
/// "Whitespace" is the Unicode `White_Space` property, the same set the
/// previous `\s+` regex matched, so non-breaking and other Unicode spaces are
/// collapsed too.
pub fn normalize_whitespace(text: &str) -> String {
    #[cfg(feature = "simd")]
    {
        // Fast path: ASCII text for which the SIMD scan reports no whitespace
        // positions needs no rewriting.
        if text.is_ascii() && crate::simd_ops::SimdStringOps::is_available() {
            let positions = crate::simd_ops::SimdStringOps::find_whitespace_positions(text);
            if positions.is_empty() {
                return text.trim().to_string();
            }
        }
    }

    // `split_whitespace` skips leading/trailing whitespace and yields the
    // words between runs, so joining with one space trims and collapses in a
    // single pass, without a regex.
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
801
802#[allow(dead_code)]
804pub fn remove_accents(text: &str) -> String {
805 use unicode_normalization::UnicodeNormalization;
806
807 text.nfd()
808 .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
809 .collect()
810}
811
#[allow(dead_code)]
/// Replaces Unicode dash characters with an ASCII hyphen-minus.
///
/// Covers U+2010 through U+2015 (hyphen, non-breaking hyphen, figure dash,
/// en dash, em dash, horizontal bar) and U+2212 (minus sign). All other
/// characters pass through unchanged.
pub fn normalize_dashes(text: &str) -> String {
    // A one-to-one character mapping needs no regex: one pass, std only.
    text.chars()
        .map(|ch| match ch {
            '\u{2010}'..='\u{2015}' | '\u{2212}' => '-',
            other => other,
        })
        .collect()
}
821
#[allow(dead_code)]
/// Replaces typographic quotation marks with their ASCII equivalents.
///
/// U+2018, U+2019, U+201A and U+201B become `'`; U+201C, U+201D, U+201E and
/// U+201F become `"`. All other characters pass through unchanged.
pub fn normalize_quotes(text: &str) -> String {
    // Single pass over the characters instead of two regex passes with an
    // intermediate string.
    text.chars()
        .map(|ch| match ch {
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
            other => other,
        })
        .collect()
}
835
836#[allow(dead_code)]
838pub fn normalize_numbers(text: &str, placeholder: &str) -> String {
839 NUMBER_PATTERN.replace_all(text, placeholder).to_string()
840}
841
842#[allow(dead_code)]
844pub fn normalize_currencies(text: &str, placeholder: &str) -> String {
845 CURRENCY_PATTERN.replace_all(text, placeholder).to_string()
846}
847
848#[allow(dead_code)]
850pub fn normalize_percentages(text: &str, placeholder: &str) -> String {
851 PERCENTAGE_PATTERN
852 .replace_all(text, placeholder)
853 .to_string()
854}
855
856#[allow(dead_code)]
858pub fn normalize_ordinals(text: &str, placeholder: &str) -> String {
859 ORDINAL_PATTERN.replace_all(text, placeholder).to_string()
860}
861
862#[allow(dead_code)]
864pub fn normalize_dates(text: &str, placeholder: &str) -> String {
865 DATE_PATTERN.replace_all(text, placeholder).to_string()
866}
867
868#[allow(dead_code)]
870pub fn normalize_times(text: &str, placeholder: &str) -> String {
871 TIME_PATTERN.replace_all(text, placeholder).to_string()
872}
873
874#[allow(dead_code)]
876pub fn normalize_fractions(text: &str, placeholder: &str) -> String {
877 FRACTION_PATTERN.replace_all(text, placeholder).to_string()
878}
879
880#[allow(dead_code)]
882pub fn normalize_roman_numerals(text: &str, placeholder: &str) -> String {
883 ROMAN_NUMERAL_PATTERN
884 .replace_all(text, placeholder)
885 .to_string()
886}
887
888#[allow(dead_code)]
890pub fn normalize_scientific_notation(text: &str, placeholder: &str) -> String {
891 SCIENTIFIC_NOTATION_PATTERN
892 .replace_all(text, placeholder)
893 .to_string()
894}
895
896#[allow(dead_code)]
898pub fn normalize_temperatures(text: &str, placeholder: &str) -> String {
899 TEMPERATURE_PATTERN
900 .replace_all(text, placeholder)
901 .to_string()
902}
903
904#[allow(dead_code)]
906pub fn normalize_measurements(text: &str, placeholder: &str) -> String {
907 MEASUREMENT_PATTERN
908 .replace_all(text, placeholder)
909 .to_string()
910}
911
912#[allow(dead_code)]
914pub fn normalize_versions(text: &str, placeholder: &str) -> String {
915 VERSION_PATTERN.replace_all(text, placeholder).to_string()
916}
917
918#[allow(dead_code)]
920pub fn normalize_ip_addresses(text: &str, placeholder: &str) -> String {
921 IP_ADDRESS_PATTERN
922 .replace_all(text, placeholder)
923 .to_string()
924}
925
926#[allow(dead_code)]
928pub fn normalize_hex_numbers(text: &str, placeholder: &str) -> String {
929 HEX_PATTERN.replace_all(text, placeholder).to_string()
930}
931
932#[allow(dead_code)]
934pub fn normalize_binary_numbers(text: &str, placeholder: &str) -> String {
935 BINARY_PATTERN.replace_all(text, placeholder).to_string()
936}
937
938#[allow(dead_code)]
940pub fn normalize_all_numbers(text: &str, placeholder: &str) -> String {
941 let mut result = text.to_string();
942
943 result = normalize_scientific_notation(&result, placeholder);
945 result = normalize_temperatures(&result, placeholder);
946 result = normalize_measurements(&result, placeholder);
947 result = normalize_hex_numbers(&result, placeholder);
948 result = normalize_binary_numbers(&result, placeholder);
949 result = normalize_currencies(&result, placeholder);
950 result = normalize_percentages(&result, placeholder);
951 result = normalize_fractions(&result, placeholder);
952 result = normalize_ordinals(&result, placeholder);
953 result = normalize_versions(&result, placeholder);
954 result = normalize_ip_addresses(&result, placeholder);
955 result = normalize_roman_numerals(&result, placeholder);
956 result = normalize_numbers(&result, placeholder);
957
958 result
959}
960
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_strip_html_tags() {
        let html = "<p>Hello <b>world</b>!</p>";
        let cleaned = strip_html_tags(html);
        // Each tag becomes one space and nothing is collapsed, so the space
        // already present before `<b>` plus the tag's replacement gives two.
        assert_eq!(cleaned, " Hello  world ! ");
    }

    #[test]
    fn test_replace_urls() {
        let text = "Check out https://www.example.com for more info";
        let replaced = replace_urls(text, "[URL]");
        assert_eq!(replaced, "Check out [URL] for more info");
    }

    #[test]
    fn test_replace_emails() {
        let text = "Contact us at support@example.com for help";
        let replaced = replace_emails(text, "[EMAIL]");
        assert_eq!(replaced, "Contact us at [EMAIL] for help");
    }

    #[test]
    fn test_expand_contractions() {
        let text = "I can't believe it's working! They'll be happy.";
        let expanded = expand_contractions(text);
        assert_eq!(
            expanded,
            "I cannot believe it is working! They will be happy."
        );
    }

    #[test]
    fn test_normalize_whitespace() {
        // Mixed runs of spaces, tabs and newlines, plus leading/trailing padding.
        let text = "  Hello   world  \n\t  test  ";
        let normalized = normalize_whitespace(text);
        assert_eq!(normalized, "Hello world test");
    }

    #[test]
    fn test_remove_accents() {
        let text = "Héllo wörld café";
        let cleaned = remove_accents(text);
        assert_eq!(cleaned, "Hello world cafe");
    }

    #[test]
    fn test_advanced_cleaner() {
        let cleaner = AdvancedTextCleaner::new();
        let text = "<p>Check out https://example.com! Email: test@example.com</p>";
        let cleaned = cleaner.clean(text).unwrap();
        assert_eq!(cleaned, "check out [url]! email: [email]");
    }

    #[test]
    fn test_privacy_focused_cleaner() {
        let cleaner = AdvancedTextCleaner::privacy_focused();
        let text = "Call me at (555) 123-4567 or email john@example.com";
        let cleaned = cleaner.clean(text).unwrap();
        assert_eq!(cleaned, "call me at [phone] or email [email]");
    }

    #[test]
    fn test_normalize_numbers() {
        let text = "The price is 1,234.56 and the quantity is 42";
        let normalized = normalize_numbers(text, "[NUM]");
        assert_eq!(normalized, "The price is [NUM] and the quantity is [NUM]");

        let text_scientific = "The value is 3.14e-10";
        let normalized_sci = normalize_numbers(text_scientific, "[NUM]");
        assert_eq!(normalized_sci, "The value is [NUM]");
    }

    #[test]
    fn test_normalize_currencies() {
        let text = "The cost is $45.99 or €50.00";
        let normalized = normalize_currencies(text, "[MONEY]");
        assert_eq!(normalized, "The cost is [MONEY] or [MONEY]");

        let text_words = "It costs 100 dollars or 85 euros";
        let normalized_words = normalize_currencies(text_words, "[MONEY]");
        assert_eq!(normalized_words, "It costs [MONEY] or [MONEY]");
    }

    #[test]
    fn test_normalize_percentages() {
        let text = "The growth is 25% and the decline is -5.5%";
        let normalized = normalize_percentages(text, "[PCT]");
        assert_eq!(normalized, "The growth is [PCT] and the decline is [PCT]");
    }

    #[test]
    fn test_normalize_ordinals() {
        let text = "He came 1st, she was 2nd, and they were 3rd";
        let normalized = normalize_ordinals(text, "[ORD]");
        assert_eq!(
            normalized,
            "He came [ORD], she was [ORD], and they were [ORD]"
        );
    }

    #[test]
    fn test_number_normalization_in_cleaner() {
        let cleaner = AdvancedTextCleaner::new()
            .set_normalize_numbers(true)
            .set_normalize_currencies(true)
            .set_normalize_percentages(true)
            .set_normalize_ordinals(true);

        let text = "The 1st item costs $99.99 with a 15% discount, total: 84.99";
        let cleaned = cleaner.clean(text).unwrap();
        assert_eq!(
            cleaned,
            "the [ordinal] item costs [currency] with a [percent] discount, total: [number]"
        );
    }

    #[test]
    fn test_normalize_dates() {
        let text = "Meeting on 12/25/2023, or December 25, 2023, or 25 December 2023";
        let normalized = normalize_dates(text, "[DATE]");
        assert_eq!(normalized, "Meeting on [DATE], or [DATE], or [DATE]");
    }

    #[test]
    fn test_normalize_times() {
        let text = "Meeting at 14:30 or 2:30 PM or 09:00:15";
        let normalized = normalize_times(text, "[TIME]");
        assert_eq!(normalized, "Meeting at [TIME] or [TIME] or [TIME]");
    }

    #[test]
    fn test_normalize_fractions() {
        let text = "Mix 1/2 cup flour with 2 3/4 cups sugar";
        let normalized = normalize_fractions(text, "[FRACTION]");
        assert_eq!(
            normalized,
            "Mix [FRACTION] cup flour with [FRACTION] cups sugar"
        );
    }

    #[test]
    fn test_normalize_roman_numerals() {
        let text = "Chapter IV discusses Section XVII and Part III";
        let normalized = normalize_roman_numerals(text, "[ROMAN]");
        assert_eq!(
            normalized,
            "Chapter [ROMAN] discusses Section [ROMAN] and Part [ROMAN]"
        );
    }

    #[test]
    fn test_normalize_scientific_notation() {
        let text = "Value is 6.022e23 or 1.23E-10";
        let normalized = normalize_scientific_notation(text, "[SCI]");
        assert_eq!(normalized, "Value is [SCI] or [SCI]");
    }

    #[test]
    fn test_normalize_temperatures() {
        let text = "Temperature is 25°C or 77°F or 298K degrees kelvin";
        let normalized = normalize_temperatures(text, "[TEMP]");
        assert_eq!(normalized, "Temperature is [TEMP] or [TEMP] or [TEMP]");
    }

    #[test]
    fn test_normalize_measurements() {
        let text = "Distance: 5km, weight: 2.5kg, speed: 60mph, storage: 1TB";
        let normalized = normalize_measurements(text, "[MEASURE]");
        assert_eq!(
            normalized,
            "Distance: [MEASURE], weight: [MEASURE], speed: [MEASURE], storage: [MEASURE]"
        );
    }

    #[test]
    fn test_normalize_versions() {
        let text = "Using Python v3.9.1 and Node.js 16.14.0-alpha1";
        let normalized = normalize_versions(text, "[VER]");
        assert_eq!(normalized, "Using Python [VER] and Node.js [VER]");
    }

    #[test]
    fn test_normalize_ip_addresses() {
        let text = "Server at 192.168.1.1 and backup at 10.0.0.255";
        let normalized = normalize_ip_addresses(text, "[IP]");
        assert_eq!(normalized, "Server at [IP] and backup at [IP]");
    }

    #[test]
    fn test_normalize_hex_numbers() {
        let text = "Color #FF5733 or address 0x1A2B3C4D";
        let normalized = normalize_hex_numbers(text, "[HEX]");
        assert_eq!(normalized, "Color [HEX] or address [HEX]");
    }

    #[test]
    fn test_normalize_binary_numbers() {
        let text = "Binary values: 0b1010 and 0b11110000";
        let normalized = normalize_binary_numbers(text, "[BIN]");
        assert_eq!(normalized, "Binary values: [BIN] and [BIN]");
    }

    #[test]
    fn test_enhanced_currency_normalization() {
        let text = "Cost is $100.50, €75.25, ¥10000, or 50 USD";
        let cleaner = AdvancedTextCleaner::new().set_normalize_currencies(true);
        let cleaned = cleaner.clean(text).unwrap();
        assert_eq!(
            cleaned,
            "cost is [currency], [currency], [currency], or [currency]"
        );
    }

    #[test]
    fn test_comprehensive_advanced_normalization() {
        let cleaner = AdvancedTextCleaner::new()
            .set_normalize_dates(true)
            .set_normalize_times(true)
            .set_normalize_fractions(true)
            .set_normalize_scientific_notation(true)
            .set_normalize_temperatures(true)
            .set_normalize_measurements(true)
            .set_normalize_versions(true)
            .set_normalize_ip_addresses(true)
            .set_normalize_hex_numbers(true)
            .set_normalize_binary_numbers(true);

        let text = "On 12/25/2023 at 14:30, server 192.168.1.1 v2.1.0 measured 25°C, processed 0xFF data with 1/2 efficiency, used 6.022e23 molecules";
        let cleaned = cleaner.clean(text).unwrap();

        // The default cleaner lower-cases its output, placeholders included.
        assert!(cleaned.contains("[date]"));
        assert!(cleaned.contains("[time]"));
        assert!(cleaned.contains("[ip]"));
        assert!(cleaned.contains("[version]"));
        assert!(cleaned.contains("[temp]"));
        assert!(cleaned.contains("[hex]"));
        assert!(cleaned.contains("[fraction]"));
        assert!(cleaned.contains("[scientific]"));
    }

    #[test]
    fn test_privacy_focused_cleaner_with_advanced_features() {
        let cleaner = AdvancedTextCleaner::privacy_focused();
        let text = "Meeting on 01/15/2024 at 14:30, contact john@example.com or call (555) 123-4567, server: 192.168.1.100";
        let cleaned = cleaner.clean(text).unwrap();

        assert_eq!(
            cleaned,
            "meeting on [date] at [time], contact [email] or call [phone], server: [ip]"
        );
    }

    #[test]
    fn test_normalize_all_numbers_function() {
        let text = "Value: 3.14e-10, temp: 25°C, price: $99.99, percent: 15%, ordinal: 1st, fraction: 1/2, hex: 0xFF, binary: 0b1010, IP: 192.168.1.1, version: v1.2.3, roman: IV";
        let normalized = normalize_all_numbers(text, "[NUM]");

        assert_eq!(normalized, "Value: [NUM], temp: [NUM], price: [NUM], percent: [NUM], ordinal: [NUM], fraction: [NUM], hex: [NUM], binary: [NUM], IP: [NUM], version: [NUM], roman: [NUM]");
    }

    #[test]
    fn test_advanced_placeholder_customization() {
        let cleaner = AdvancedTextCleaner::new()
            .set_normalize_dates(true)
            .set_normalize_temperatures(true)
            .set_normalize_hex_numbers(true)
            .set_advanced_placeholders(
                Some("[CUSTOM_DATE]".to_string()),
                None,
                None,
                None,
                None,
                Some("[CUSTOM_TEMP]".to_string()),
                None,
                None,
                None,
                Some("[CUSTOM_HEX]".to_string()),
                None,
            );

        let text = "Date: 12/25/2023, temp: 25°C, color: #FF0000";
        let cleaned = cleaner.clean(text).unwrap();
        assert_eq!(
            cleaned,
            "date: [custom_date], temp: [custom_temp], color: [custom_hex]"
        );
    }
}