scirs2_text/
cleansing.rs

1//! Advanced text cleansing utilities
2//!
3//! This module provides functions for cleaning text data including
4//! HTML stripping, URL handling, and various normalization operations.
5
6use crate::error::Result;
7use lazy_static::lazy_static;
8use regex::Regex;
9use std::collections::HashMap;
10
lazy_static! {
    // Shared, lazily compiled patterns used by `AdvancedTextCleaner::clean` and by the
    // free helper functions in this module. Every `unwrap()` below is on a literal
    // pattern, so a failure would be a programming error in this file.

    // HTML/XML tag pattern: any `<...>` span that contains at least one character.
    static ref HTML_TAG_PATTERN: Regex = Regex::new(r"<[^>]+>").unwrap();

    // URL pattern: case-insensitive `http://` / `https://` URLs with an optional `www.`
    static ref URL_PATTERN: Regex = Regex::new(
        r"(?i)https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*)"
    ).unwrap();

    // Email pattern: `local@domain.tld` with a TLD of at least two ASCII letters
    static ref EMAIL_PATTERN: Regex = Regex::new(
        r"(?i)[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Phone number pattern (simplified, US-style): optional `+1`, then 3-3-4 digits
    // separated by `-`, `.`, whitespace or nothing; the area code may be parenthesized.
    static ref PHONE_PATTERN: Regex = Regex::new(
        r"(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})"
    ).unwrap();

    // Common contractions mapping (contraction -> expansion).
    // Keys are either whole words ("can't") or suffix fragments ("n't", "'re"); the
    // expansions of suffix fragments start with a space so they read as separate words.
    static ref CONTRACTIONS: HashMap<&'static str, &'static str> = {
        let mut m = HashMap::new();
        m.insert("can't", "cannot");
        m.insert("won't", "will not");
        m.insert("n't", " not");
        m.insert("'re", " are");
        m.insert("'ve", " have");
        m.insert("'ll", " will");
        m.insert("'d", " would");
        m.insert("'m", " am");
        m.insert("'s", " is");
        m.insert("let's", "let us");
        m.insert("it's", "it is");
        m.insert("that's", "that is");
        m.insert("what's", "what is");
        m.insert("where's", "where is");
        m.insert("who's", "who is");
        m.insert("there's", "there is");
        m.insert("here's", "here is");
        m
    };

    // Emoji pattern: a single character class assembled from Unicode block ranges
    static ref EMOJI_PATTERN: Regex = Regex::new(
        concat!(
            "[",
            "\u{1F600}-\u{1F64F}", // Emoticons
            "\u{1F300}-\u{1F5FF}", // Symbols & Pictographs
            "\u{1F680}-\u{1F6FF}", // Transport & Map
            "\u{1F700}-\u{1F77F}", // Alchemical Symbols
            "\u{1F780}-\u{1F7FF}", // Geometric Shapes Extended
            "\u{1F800}-\u{1F8FF}", // Supplemental Arrows-C
            "\u{2600}-\u{26FF}",   // Miscellaneous Symbols
            "\u{2700}-\u{27BF}",   // Dingbats
            "]"
        )
    ).unwrap();

    // General number pattern: optional sign, digits with optional thousands separators,
    // optional decimal part and optional exponent.
    static ref NUMBER_PATTERN: Regex = Regex::new(
        r"(?i)\b[-+]?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?(?:[eE][-+]?\d+)?\b"
    ).unwrap();

    // Basic currency pattern: symbol-prefixed amounts or amounts followed by a currency
    // word/code. Used by the free function `normalize_currencies`; the cleaner itself
    // uses ENHANCED_CURRENCY_PATTERN.
    static ref CURRENCY_PATTERN: Regex = Regex::new(
        r"(?i)(?:[$€£¥₹])[ \t]*(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?|(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?[ \t]*(?:dollars?|euros?|pounds?|yen|rupees?|USD|EUR|GBP|JPY|INR)\b"
    ).unwrap();

    // Percentage pattern: optionally signed number immediately followed by `%`
    static ref PERCENTAGE_PATTERN: Regex = Regex::new(
        r"(?i)[-+]?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?%"
    ).unwrap();

    // Ordinal pattern: digits followed by st/nd/rd/th (the suffix is not checked
    // against the number, so "1th" also matches)
    static ref ORDINAL_PATTERN: Regex = Regex::new(
        r"(?i)\b(\d+)(?:st|nd|rd|th)\b"
    ).unwrap();

    // Advanced number patterns for enhanced normalization

    // Date patterns (various formats): M/D/Y, Y/M/D, "Month D, YYYY" and "D Month YYYY".
    // NOTE(review): `|` binds loosest, so the leading `\b` applies only to the first
    // alternative and the trailing `\b` only to the last one -- confirm this is intended.
    static ref DATE_PATTERN: Regex = Regex::new(
        r"(?i)\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:19|20)?\d{2})|(?:(?:19|20)\d{2}[/-](?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01]))|(?:(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2},?\s+\d{4})|(?:\d{1,2}\s+(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{4})\b"
    ).unwrap();

    // Time patterns: H:MM or H:MM:SS (24-hour range) with an optional am/pm suffix
    static ref TIME_PATTERN: Regex = Regex::new(
        r"(?i)\b(?:[01]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9])?(?:\s*[aApP][mM])?\b"
    ).unwrap();

    // Fraction patterns: "a/b" or mixed "n a/b", whitespace around `/` allowed
    static ref FRACTION_PATTERN: Regex = Regex::new(
        r"\b\d+\s*/\s*\d+\b|\b\d+\s+\d+\s*/\s*\d+\b"
    ).unwrap();

    // Roman numeral patterns.
    // NOTE(review): this matches any word made only of the letters M, D, C, L, X, V, I
    // in either case (e.g. "I", "mix", "did"); numeral validity is not checked.
    static ref ROMAN_NUMERAL_PATTERN: Regex = Regex::new(
        r"(?i)\b[MDCLXVI]+\b"
    ).unwrap();

    // Scientific notation (enhanced): mantissa with a mandatory exponent part
    static ref SCIENTIFIC_NOTATION_PATTERN: Regex = Regex::new(
        r"(?i)[-+]?(?:\d+\.?\d*|\.\d+)[eE][-+]?\d+"
    ).unwrap();

    // Temperature patterns: number followed by a degree sign + unit letter, a unit
    // letter, or the word "degree(s)" plus a scale name
    static ref TEMPERATURE_PATTERN: Regex = Regex::new(
        r"(?i)[-+]?(?:\d+\.?\d*|\.\d+)\s*(?:°[CFK]|[CFK](?:\s+degrees?)?\s*(?:celsius|fahrenheit|kelvin)?|degrees?\s+(?:celsius|fahrenheit|kelvin))\b"
    ).unwrap();

    // Measurement unit patterns: number followed by one of a fixed list of unit tokens
    static ref MEASUREMENT_PATTERN: Regex = Regex::new(
        r"(?i)[-+]?(?:\d+\.?\d*|\.\d+)\s*(?:mm|cm|m|km|in|ft|yd|mi|g|kg|lb|oz|ml|l|gal|mph|kph|Hz|kHz|MHz|GHz|KB|MB|GB|TB|°|rad|sq|cu)\b"
    ).unwrap();

    // Enhanced currency pattern with more currencies and formats; this is the pattern
    // `AdvancedTextCleaner::clean` applies when currency normalization is enabled.
    static ref ENHANCED_CURRENCY_PATTERN: Regex = Regex::new(
        r"(?i)(?:[$€£¥₹₽₩¢₪₨₦₴₵₡₲₱₫₭₦₨]\s*\d+(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:,\d{3})*(?:\.\d{1,2})?\s*(?:USD|EUR|GBP|JPY|INR|RUB|KRW|CNY|CAD|AUD|CHF|SGD|dollars?|euros?|pounds?|yen|rupees?|yuan|won|rubles?))\b"
    ).unwrap();

    // Version numbers: optional `v`, 2-4 dot-separated numeric components and an
    // optional `-tag` suffix such as `-beta2`
    static ref VERSION_PATTERN: Regex = Regex::new(
        r"\bv?\d+(?:\.\d+){1,3}(?:-[a-zA-Z]+\d*)?\b"
    ).unwrap();

    // IP addresses: dotted-quad IPv4 with each octet limited to 0-255
    static ref IP_ADDRESS_PATTERN: Regex = Regex::new(
        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
    ).unwrap();

    // Hexadecimal numbers: `0x...` literals or `#` followed by 3-8 hex digits
    static ref HEX_PATTERN: Regex = Regex::new(
        r"(?i)\b0x[0-9a-f]+\b|#[0-9a-f]{3,8}\b"
    ).unwrap();

    // Binary numbers: `0b` followed by binary digits (lowercase prefix only)
    static ref BINARY_PATTERN: Regex = Regex::new(
        r"\b0b[01]+\b"
    ).unwrap();
}
151
/// Advanced text cleaner with various cleaning options
///
/// Each boolean flag enables one step of [`AdvancedTextCleaner::clean`]; each
/// `*_placeholder` string is the replacement text substituted for matches of the
/// corresponding pattern. Instances are configured through the consuming `set_*`
/// builder methods.
#[derive(Debug, Clone)]
pub struct AdvancedTextCleaner {
    // --- Structural and contact-data steps ---
    // Replace HTML/XML tags with a space.
    strip_html: bool,
    // Replace URLs / e-mail addresses / phone numbers with their placeholders.
    replace_urls: bool,
    replace_emails: bool,
    replace_phone_numbers: bool,
    // Expand contractions such as "can't" via `expand_contractions`.
    expand_contractions: bool,
    // Replace emoji characters with a space.
    remove_emojis: bool,
    // Apply Unicode NFC normalization via `normalize_unicode`.
    normalize_unicode: bool,
    // When false, `clean` lowercases the text after all replacements.
    preserve_case: bool,
    // --- Numeric / token normalization steps (each replaces matches with a placeholder) ---
    normalize_numbers: bool,
    normalize_currencies: bool,
    normalize_percentages: bool,
    normalize_ordinals: bool,
    normalize_dates: bool,
    normalize_times: bool,
    normalize_fractions: bool,
    normalize_roman_numerals: bool,
    normalize_scientific_notation: bool,
    normalize_temperatures: bool,
    normalize_measurements: bool,
    normalize_versions: bool,
    normalize_ip_addresses: bool,
    normalize_hex_numbers: bool,
    normalize_binary_numbers: bool,
    // --- Replacement strings, one per step above ---
    url_placeholder: String,
    email_placeholder: String,
    phone_placeholder: String,
    number_placeholder: String,
    currency_placeholder: String,
    percentage_placeholder: String,
    ordinal_placeholder: String,
    date_placeholder: String,
    time_placeholder: String,
    fraction_placeholder: String,
    roman_placeholder: String,
    scientific_placeholder: String,
    temperature_placeholder: String,
    measurement_placeholder: String,
    version_placeholder: String,
    ip_placeholder: String,
    hex_placeholder: String,
    binary_placeholder: String,
}
197
198impl AdvancedTextCleaner {
199    /// Create a new advanced text cleaner with default settings
200    pub fn new() -> Self {
201        Self {
202            strip_html: true,
203            replace_urls: true,
204            replace_emails: true,
205            replace_phone_numbers: false,
206            expand_contractions: true,
207            remove_emojis: false,
208            normalize_unicode: true,
209            preserve_case: false,
210            normalize_numbers: false,
211            normalize_currencies: false,
212            normalize_percentages: false,
213            normalize_ordinals: false,
214            normalize_dates: false,
215            normalize_times: false,
216            normalize_fractions: false,
217            normalize_roman_numerals: false,
218            normalize_scientific_notation: false,
219            normalize_temperatures: false,
220            normalize_measurements: false,
221            normalize_versions: false,
222            normalize_ip_addresses: false,
223            normalize_hex_numbers: false,
224            normalize_binary_numbers: false,
225            url_placeholder: "[URL]".to_string(),
226            email_placeholder: "[EMAIL]".to_string(),
227            phone_placeholder: "[PHONE]".to_string(),
228            number_placeholder: "[NUMBER]".to_string(),
229            currency_placeholder: "[CURRENCY]".to_string(),
230            percentage_placeholder: "[PERCENT]".to_string(),
231            ordinal_placeholder: "[ORDINAL]".to_string(),
232            date_placeholder: "[DATE]".to_string(),
233            time_placeholder: "[TIME]".to_string(),
234            fraction_placeholder: "[FRACTION]".to_string(),
235            roman_placeholder: "[ROMAN]".to_string(),
236            scientific_placeholder: "[SCIENTIFIC]".to_string(),
237            temperature_placeholder: "[TEMP]".to_string(),
238            measurement_placeholder: "[MEASURE]".to_string(),
239            version_placeholder: "[VERSION]".to_string(),
240            ip_placeholder: "[IP]".to_string(),
241            hex_placeholder: "[HEX]".to_string(),
242            binary_placeholder: "[BINARY]".to_string(),
243        }
244    }
245
246    /// Create a cleaner for privacy-sensitive text
247    pub fn privacy_focused() -> Self {
248        Self {
249            strip_html: true,
250            replace_urls: true,
251            replace_emails: true,
252            replace_phone_numbers: true,
253            expand_contractions: true,
254            remove_emojis: false,
255            normalize_unicode: true,
256            preserve_case: false,
257            normalize_numbers: false,
258            normalize_currencies: false,
259            normalize_percentages: false,
260            normalize_ordinals: false,
261            normalize_dates: true, // Privacy-focused normalizes dates
262            normalize_times: true, // and times
263            normalize_fractions: false,
264            normalize_roman_numerals: false,
265            normalize_scientific_notation: false,
266            normalize_temperatures: false,
267            normalize_measurements: false,
268            normalize_versions: false,
269            normalize_ip_addresses: true, // Privacy-focused normalizes IPs
270            normalize_hex_numbers: false,
271            normalize_binary_numbers: false,
272            url_placeholder: "[URL]".to_string(),
273            email_placeholder: "[EMAIL]".to_string(),
274            phone_placeholder: "[PHONE]".to_string(),
275            number_placeholder: "[NUMBER]".to_string(),
276            currency_placeholder: "[CURRENCY]".to_string(),
277            percentage_placeholder: "[PERCENT]".to_string(),
278            ordinal_placeholder: "[ORDINAL]".to_string(),
279            date_placeholder: "[DATE]".to_string(),
280            time_placeholder: "[TIME]".to_string(),
281            fraction_placeholder: "[FRACTION]".to_string(),
282            roman_placeholder: "[ROMAN]".to_string(),
283            scientific_placeholder: "[SCIENTIFIC]".to_string(),
284            temperature_placeholder: "[TEMP]".to_string(),
285            measurement_placeholder: "[MEASURE]".to_string(),
286            version_placeholder: "[VERSION]".to_string(),
287            ip_placeholder: "[IP]".to_string(),
288            hex_placeholder: "[HEX]".to_string(),
289            binary_placeholder: "[BINARY]".to_string(),
290        }
291    }
292
293    /// Create a minimal cleaner that preserves most content
294    pub fn minimal() -> Self {
295        Self {
296            strip_html: true,
297            replace_urls: false,
298            replace_emails: false,
299            replace_phone_numbers: false,
300            expand_contractions: false,
301            remove_emojis: false,
302            normalize_unicode: true,
303            preserve_case: true,
304            normalize_numbers: false,
305            normalize_currencies: false,
306            normalize_percentages: false,
307            normalize_ordinals: false,
308            normalize_dates: false,
309            normalize_times: false,
310            normalize_fractions: false,
311            normalize_roman_numerals: false,
312            normalize_scientific_notation: false,
313            normalize_temperatures: false,
314            normalize_measurements: false,
315            normalize_versions: false,
316            normalize_ip_addresses: false,
317            normalize_hex_numbers: false,
318            normalize_binary_numbers: false,
319            url_placeholder: "[URL]".to_string(),
320            email_placeholder: "[EMAIL]".to_string(),
321            phone_placeholder: "[PHONE]".to_string(),
322            number_placeholder: "[NUMBER]".to_string(),
323            currency_placeholder: "[CURRENCY]".to_string(),
324            percentage_placeholder: "[PERCENT]".to_string(),
325            ordinal_placeholder: "[ORDINAL]".to_string(),
326            date_placeholder: "[DATE]".to_string(),
327            time_placeholder: "[TIME]".to_string(),
328            fraction_placeholder: "[FRACTION]".to_string(),
329            roman_placeholder: "[ROMAN]".to_string(),
330            scientific_placeholder: "[SCIENTIFIC]".to_string(),
331            temperature_placeholder: "[TEMP]".to_string(),
332            measurement_placeholder: "[MEASURE]".to_string(),
333            version_placeholder: "[VERSION]".to_string(),
334            ip_placeholder: "[IP]".to_string(),
335            hex_placeholder: "[HEX]".to_string(),
336            binary_placeholder: "[BINARY]".to_string(),
337        }
338    }
339
340    /// Set whether to strip HTML tags
341    pub fn set_strip_html(mut self, value: bool) -> Self {
342        self.strip_html = value;
343        self
344    }
345
346    /// Set whether to replace URLs
347    pub fn set_replace_urls(mut self, value: bool) -> Self {
348        self.replace_urls = value;
349        self
350    }
351
352    /// Set whether to replace emails
353    pub fn set_replace_emails(mut self, value: bool) -> Self {
354        self.replace_emails = value;
355        self
356    }
357
358    /// Set whether to replace phone numbers
359    pub fn set_replace_phone_numbers(mut self, value: bool) -> Self {
360        self.replace_phone_numbers = value;
361        self
362    }
363
364    /// Set whether to expand contractions
365    pub fn set_expand_contractions(mut self, value: bool) -> Self {
366        self.expand_contractions = value;
367        self
368    }
369
370    /// Set whether to remove emojis
371    pub fn set_remove_emojis(mut self, value: bool) -> Self {
372        self.remove_emojis = value;
373        self
374    }
375
376    /// Set whether to normalize numbers
377    pub fn set_normalize_numbers(mut self, value: bool) -> Self {
378        self.normalize_numbers = value;
379        self
380    }
381
382    /// Set whether to normalize currencies
383    pub fn set_normalize_currencies(mut self, value: bool) -> Self {
384        self.normalize_currencies = value;
385        self
386    }
387
388    /// Set whether to normalize percentages
389    pub fn set_normalize_percentages(mut self, value: bool) -> Self {
390        self.normalize_percentages = value;
391        self
392    }
393
394    /// Set whether to normalize ordinals
395    pub fn set_normalize_ordinals(mut self, value: bool) -> Self {
396        self.normalize_ordinals = value;
397        self
398    }
399
400    /// Set whether to normalize dates
401    pub fn set_normalize_dates(mut self, value: bool) -> Self {
402        self.normalize_dates = value;
403        self
404    }
405
406    /// Set whether to normalize times
407    pub fn set_normalize_times(mut self, value: bool) -> Self {
408        self.normalize_times = value;
409        self
410    }
411
412    /// Set whether to normalize fractions
413    pub fn set_normalize_fractions(mut self, value: bool) -> Self {
414        self.normalize_fractions = value;
415        self
416    }
417
418    /// Set whether to normalize roman numerals
419    pub fn set_normalize_roman_numerals(mut self, value: bool) -> Self {
420        self.normalize_roman_numerals = value;
421        self
422    }
423
424    /// Set whether to normalize scientific notation
425    pub fn set_normalize_scientific_notation(mut self, value: bool) -> Self {
426        self.normalize_scientific_notation = value;
427        self
428    }
429
430    /// Set whether to normalize temperatures
431    pub fn set_normalize_temperatures(mut self, value: bool) -> Self {
432        self.normalize_temperatures = value;
433        self
434    }
435
436    /// Set whether to normalize measurements
437    pub fn set_normalize_measurements(mut self, value: bool) -> Self {
438        self.normalize_measurements = value;
439        self
440    }
441
442    /// Set whether to normalize version numbers
443    pub fn set_normalize_versions(mut self, value: bool) -> Self {
444        self.normalize_versions = value;
445        self
446    }
447
448    /// Set whether to normalize IP addresses
449    pub fn set_normalize_ip_addresses(mut self, value: bool) -> Self {
450        self.normalize_ip_addresses = value;
451        self
452    }
453
454    /// Set whether to normalize hexadecimal numbers
455    pub fn set_normalize_hex_numbers(mut self, value: bool) -> Self {
456        self.normalize_hex_numbers = value;
457        self
458    }
459
460    /// Set whether to normalize binary numbers
461    pub fn set_normalize_binary_numbers(mut self, value: bool) -> Self {
462        self.normalize_binary_numbers = value;
463        self
464    }
465
466    /// Set custom placeholders
467    pub fn set_placeholders(
468        mut self,
469        url: Option<String>,
470        email: Option<String>,
471        phone: Option<String>,
472    ) -> Self {
473        if let Some(u) = url {
474            self.url_placeholder = u;
475        }
476        if let Some(e) = email {
477            self.email_placeholder = e;
478        }
479        if let Some(p) = phone {
480            self.phone_placeholder = p;
481        }
482        self
483    }
484
485    /// Set custom number placeholders
486    pub fn set_number_placeholders(
487        mut self,
488        number: Option<String>,
489        currency: Option<String>,
490        percentage: Option<String>,
491        ordinal: Option<String>,
492    ) -> Self {
493        if let Some(n) = number {
494            self.number_placeholder = n;
495        }
496        if let Some(c) = currency {
497            self.currency_placeholder = c;
498        }
499        if let Some(p) = percentage {
500            self.percentage_placeholder = p;
501        }
502        if let Some(o) = ordinal {
503            self.ordinal_placeholder = o;
504        }
505        self
506    }
507
508    /// Set custom advanced placeholders for new normalization types
509    pub fn set_advanced_placeholders(
510        mut self,
511        date: Option<String>,
512        time: Option<String>,
513        fraction: Option<String>,
514        roman: Option<String>,
515        scientific: Option<String>,
516        temperature: Option<String>,
517        measurement: Option<String>,
518        version: Option<String>,
519        ip: Option<String>,
520        hex: Option<String>,
521        binary: Option<String>,
522    ) -> Self {
523        if let Some(d) = date {
524            self.date_placeholder = d;
525        }
526        if let Some(t) = time {
527            self.time_placeholder = t;
528        }
529        if let Some(f) = fraction {
530            self.fraction_placeholder = f;
531        }
532        if let Some(r) = roman {
533            self.roman_placeholder = r;
534        }
535        if let Some(s) = scientific {
536            self.scientific_placeholder = s;
537        }
538        if let Some(temp) = temperature {
539            self.temperature_placeholder = temp;
540        }
541        if let Some(m) = measurement {
542            self.measurement_placeholder = m;
543        }
544        if let Some(v) = version {
545            self.version_placeholder = v;
546        }
547        if let Some(i) = ip {
548            self.ip_placeholder = i;
549        }
550        if let Some(h) = hex {
551            self.hex_placeholder = h;
552        }
553        if let Some(b) = binary {
554            self.binary_placeholder = b;
555        }
556        self
557    }
558
559    /// Clean the text according to the configured options
560    pub fn clean(&self, text: &str) -> Result<String> {
561        let mut cleaned = text.to_string();
562
563        // Strip HTML tags
564        if self.strip_html {
565            cleaned = strip_html_tags(&cleaned);
566        }
567
568        // Replace URLs
569        if self.replace_urls {
570            cleaned = URL_PATTERN
571                .replace_all(&cleaned, &self.url_placeholder)
572                .to_string();
573        }
574
575        // Replace emails
576        if self.replace_emails {
577            cleaned = EMAIL_PATTERN
578                .replace_all(&cleaned, &self.email_placeholder)
579                .to_string();
580        }
581
582        // Replace phone numbers
583        if self.replace_phone_numbers {
584            cleaned = PHONE_PATTERN
585                .replace_all(&cleaned, &self.phone_placeholder)
586                .to_string();
587        }
588
589        // Expand contractions
590        if self.expand_contractions {
591            cleaned = expand_contractions(&cleaned);
592        }
593
594        // Remove emojis
595        if self.remove_emojis {
596            cleaned = EMOJI_PATTERN.replace_all(&cleaned, " ").to_string();
597        }
598
599        // Normalize specialized patterns first (before general numbers)
600
601        // Normalize dates
602        if self.normalize_dates {
603            cleaned = DATE_PATTERN
604                .replace_all(&cleaned, &self.date_placeholder)
605                .to_string();
606        }
607
608        // Normalize times
609        if self.normalize_times {
610            cleaned = TIME_PATTERN
611                .replace_all(&cleaned, &self.time_placeholder)
612                .to_string();
613        }
614
615        // Normalize IP addresses (before general numbers)
616        if self.normalize_ip_addresses {
617            cleaned = IP_ADDRESS_PATTERN
618                .replace_all(&cleaned, &self.ip_placeholder)
619                .to_string();
620        }
621
622        // Normalize version numbers (before general numbers)
623        if self.normalize_versions {
624            cleaned = VERSION_PATTERN
625                .replace_all(&cleaned, &self.version_placeholder)
626                .to_string();
627        }
628
629        // Normalize scientific notation (before general numbers)
630        if self.normalize_scientific_notation {
631            cleaned = SCIENTIFIC_NOTATION_PATTERN
632                .replace_all(&cleaned, &self.scientific_placeholder)
633                .to_string();
634        }
635
636        // Normalize temperatures (before general numbers)
637        if self.normalize_temperatures {
638            cleaned = TEMPERATURE_PATTERN
639                .replace_all(&cleaned, &self.temperature_placeholder)
640                .to_string();
641        }
642
643        // Normalize measurements (before general numbers)
644        if self.normalize_measurements {
645            cleaned = MEASUREMENT_PATTERN
646                .replace_all(&cleaned, &self.measurement_placeholder)
647                .to_string();
648        }
649
650        // Normalize hexadecimal numbers (before general numbers)
651        if self.normalize_hex_numbers {
652            cleaned = HEX_PATTERN
653                .replace_all(&cleaned, &self.hex_placeholder)
654                .to_string();
655        }
656
657        // Normalize binary numbers (before general numbers)
658        if self.normalize_binary_numbers {
659            cleaned = BINARY_PATTERN
660                .replace_all(&cleaned, &self.binary_placeholder)
661                .to_string();
662        }
663
664        // Normalize enhanced currencies (before general currencies)
665        if self.normalize_currencies {
666            cleaned = ENHANCED_CURRENCY_PATTERN
667                .replace_all(&cleaned, &self.currency_placeholder)
668                .to_string();
669        }
670
671        // Normalize fractions (before general numbers)
672        if self.normalize_fractions {
673            cleaned = FRACTION_PATTERN
674                .replace_all(&cleaned, &self.fraction_placeholder)
675                .to_string();
676        }
677
678        // Normalize roman numerals (before general numbers)
679        if self.normalize_roman_numerals {
680            cleaned = ROMAN_NUMERAL_PATTERN
681                .replace_all(&cleaned, &self.roman_placeholder)
682                .to_string();
683        }
684
685        // Normalize percentages (before general numbers)
686        if self.normalize_percentages {
687            cleaned = PERCENTAGE_PATTERN
688                .replace_all(&cleaned, &self.percentage_placeholder)
689                .to_string();
690        }
691
692        // Normalize ordinals (before general numbers)
693        if self.normalize_ordinals {
694            cleaned = ORDINAL_PATTERN
695                .replace_all(&cleaned, &self.ordinal_placeholder)
696                .to_string();
697        }
698
699        // Normalize general numbers
700        if self.normalize_numbers {
701            cleaned = NUMBER_PATTERN
702                .replace_all(&cleaned, &self.number_placeholder)
703                .to_string();
704        }
705
706        // Normalize unicode
707        if self.normalize_unicode {
708            cleaned = normalize_unicode(&cleaned)?;
709        }
710
711        // Handle case
712        if !self.preserve_case {
713            cleaned = cleaned.to_lowercase();
714        }
715
716        // Normalize whitespace
717        cleaned = normalize_whitespace(&cleaned);
718
719        Ok(cleaned)
720    }
721}
722
723impl Default for AdvancedTextCleaner {
724    fn default() -> Self {
725        Self::new()
726    }
727}
728
729/// Strip HTML and XML tags from text
730#[allow(dead_code)]
731pub fn strip_html_tags(text: &str) -> String {
732    HTML_TAG_PATTERN.replace_all(text, " ").to_string()
733}
734
735/// Replace URLs with a placeholder
736#[allow(dead_code)]
737pub fn replace_urls(text: &str, placeholder: &str) -> String {
738    URL_PATTERN.replace_all(text, placeholder).to_string()
739}
740
741/// Replace email addresses with a placeholder
742#[allow(dead_code)]
743pub fn replace_emails(text: &str, placeholder: &str) -> String {
744    EMAIL_PATTERN.replace_all(text, placeholder).to_string()
745}
746
747/// Replace phone numbers with a placeholder
748#[allow(dead_code)]
749pub fn replace_phone_numbers(text: &str, placeholder: &str) -> String {
750    PHONE_PATTERN.replace_all(text, placeholder).to_string()
751}
752
753/// Expand common contractions
754#[allow(dead_code)]
755pub fn expand_contractions(text: &str) -> String {
756    let mut result = text.to_string();
757
758    // Sort contractions by length (descending) to avoid partial replacements
759    let mut contractions: Vec<_> = CONTRACTIONS.iter().collect();
760    contractions.sort_by_key(|(k_, _)| std::cmp::Reverse(k_.len()));
761
762    for (contraction, expansion) in contractions {
763        let escaped = regex::escape(contraction);
764        let pattern = format!(r"\b{escaped}\b");
765        if let Ok(re) = Regex::new(&pattern) {
766            result = re.replace_all(&result, *expansion).to_string();
767        }
768    }
769
770    result
771}
772
773/// Normalize Unicode text (NFD -> NFC)
774#[allow(dead_code)]
775pub fn normalize_unicode(text: &str) -> Result<String> {
776    use unicode_normalization::UnicodeNormalization;
777    Ok(text.nfc().collect())
778}
779
/// Normalize whitespace: trim both ends and collapse every run of whitespace
/// (spaces, tabs, newlines and other Unicode whitespace) into a single space.
///
/// With the `simd` feature enabled, ASCII input in which the SIMD scan reports no
/// whitespace positions takes a fast path that only trims.
#[allow(dead_code)]
pub fn normalize_whitespace(text: &str) -> String {
    #[cfg(feature = "simd")]
    {
        // Fast path: nothing to collapse when the SIMD scan finds no whitespace.
        if text.is_ascii()
            && crate::simd_ops::SimdStringOps::is_available()
            && crate::simd_ops::SimdStringOps::find_whitespace_positions(text).is_empty()
        {
            return text.trim().to_string();
        }
    }

    // `split_whitespace` skips leading/trailing whitespace and treats each run of
    // Unicode whitespace as one separator, which is exactly "trim + collapse".
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
801
802/// Remove accents from text
803#[allow(dead_code)]
804pub fn remove_accents(text: &str) -> String {
805    use unicode_normalization::UnicodeNormalization;
806
807    text.nfd()
808        .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
809        .collect()
810}
811
/// Convert various dash types to a regular ASCII hyphen (`-`).
///
/// Covers U+2010 through U+2015 (hyphen, non-breaking hyphen, figure dash, en dash,
/// em dash, horizontal bar) and U+2212 (minus sign). All other characters are
/// returned unchanged.
#[allow(dead_code)]
pub fn normalize_dashes(text: &str) -> String {
    // A fixed one-to-one character translation needs no regex machinery.
    text.chars()
        .map(|c| match c {
            '\u{2010}'..='\u{2015}' | '\u{2212}' => '-',
            other => other,
        })
        .collect()
}
821
/// Convert typographic quotation marks to their plain ASCII equivalents.
///
/// U+2018, U+2019, U+201A and U+201B become `'`; U+201C, U+201D, U+201E and U+201F
/// become `"`. All other characters are returned unchanged.
#[allow(dead_code)]
pub fn normalize_quotes(text: &str) -> String {
    // Single pass over the characters instead of two regex passes.
    text.chars()
        .map(|c| match c {
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
            other => other,
        })
        .collect()
}
835
836/// Normalize numbers in text
837#[allow(dead_code)]
838pub fn normalize_numbers(text: &str, placeholder: &str) -> String {
839    NUMBER_PATTERN.replace_all(text, placeholder).to_string()
840}
841
842/// Normalize currency values in text
843#[allow(dead_code)]
844pub fn normalize_currencies(text: &str, placeholder: &str) -> String {
845    CURRENCY_PATTERN.replace_all(text, placeholder).to_string()
846}
847
848/// Normalize percentage values in text
849#[allow(dead_code)]
850pub fn normalize_percentages(text: &str, placeholder: &str) -> String {
851    PERCENTAGE_PATTERN
852        .replace_all(text, placeholder)
853        .to_string()
854}
855
856/// Normalize ordinal numbers in text
857#[allow(dead_code)]
858pub fn normalize_ordinals(text: &str, placeholder: &str) -> String {
859    ORDINAL_PATTERN.replace_all(text, placeholder).to_string()
860}
861
862/// Normalize dates in text
863#[allow(dead_code)]
864pub fn normalize_dates(text: &str, placeholder: &str) -> String {
865    DATE_PATTERN.replace_all(text, placeholder).to_string()
866}
867
868/// Normalize times in text
869#[allow(dead_code)]
870pub fn normalize_times(text: &str, placeholder: &str) -> String {
871    TIME_PATTERN.replace_all(text, placeholder).to_string()
872}
873
874/// Normalize fractions in text
875#[allow(dead_code)]
876pub fn normalize_fractions(text: &str, placeholder: &str) -> String {
877    FRACTION_PATTERN.replace_all(text, placeholder).to_string()
878}
879
880/// Normalize roman numerals in text
881#[allow(dead_code)]
882pub fn normalize_roman_numerals(text: &str, placeholder: &str) -> String {
883    ROMAN_NUMERAL_PATTERN
884        .replace_all(text, placeholder)
885        .to_string()
886}
887
888/// Normalize scientific notation in text
889#[allow(dead_code)]
890pub fn normalize_scientific_notation(text: &str, placeholder: &str) -> String {
891    SCIENTIFIC_NOTATION_PATTERN
892        .replace_all(text, placeholder)
893        .to_string()
894}
895
896/// Normalize temperatures in text
897#[allow(dead_code)]
898pub fn normalize_temperatures(text: &str, placeholder: &str) -> String {
899    TEMPERATURE_PATTERN
900        .replace_all(text, placeholder)
901        .to_string()
902}
903
904/// Normalize measurements in text
905#[allow(dead_code)]
906pub fn normalize_measurements(text: &str, placeholder: &str) -> String {
907    MEASUREMENT_PATTERN
908        .replace_all(text, placeholder)
909        .to_string()
910}
911
912/// Normalize version numbers in text
913#[allow(dead_code)]
914pub fn normalize_versions(text: &str, placeholder: &str) -> String {
915    VERSION_PATTERN.replace_all(text, placeholder).to_string()
916}
917
918/// Normalize IP addresses in text
919#[allow(dead_code)]
920pub fn normalize_ip_addresses(text: &str, placeholder: &str) -> String {
921    IP_ADDRESS_PATTERN
922        .replace_all(text, placeholder)
923        .to_string()
924}
925
926/// Normalize hexadecimal numbers in text
927#[allow(dead_code)]
928pub fn normalize_hex_numbers(text: &str, placeholder: &str) -> String {
929    HEX_PATTERN.replace_all(text, placeholder).to_string()
930}
931
932/// Normalize binary numbers in text
933#[allow(dead_code)]
934pub fn normalize_binary_numbers(text: &str, placeholder: &str) -> String {
935    BINARY_PATTERN.replace_all(text, placeholder).to_string()
936}
937
938/// Normalize all number formats comprehensively
939#[allow(dead_code)]
940pub fn normalize_all_numbers(text: &str, placeholder: &str) -> String {
941    let mut result = text.to_string();
942
943    // Apply in order of specificity (most specific first)
944    result = normalize_scientific_notation(&result, placeholder);
945    result = normalize_temperatures(&result, placeholder);
946    result = normalize_measurements(&result, placeholder);
947    result = normalize_hex_numbers(&result, placeholder);
948    result = normalize_binary_numbers(&result, placeholder);
949    result = normalize_currencies(&result, placeholder);
950    result = normalize_percentages(&result, placeholder);
951    result = normalize_fractions(&result, placeholder);
952    result = normalize_ordinals(&result, placeholder);
953    result = normalize_versions(&result, placeholder);
954    result = normalize_ip_addresses(&result, placeholder);
955    result = normalize_roman_numerals(&result, placeholder);
956    result = normalize_numbers(&result, placeholder);
957
958    result
959}
960
#[cfg(test)]
mod tests {
    use super::*;

    /// Each HTML tag in the input is expected to be replaced by one space.
    #[test]
    fn test_strip_html_tags() {
        let stripped = strip_html_tags("<p>Hello <b>world</b>!</p>");
        assert_eq!(stripped, " Hello  world ! ");
    }

    /// A full `https://` URL is expected to collapse to the placeholder.
    #[test]
    fn test_replace_urls() {
        let input = "Check out https://www.example.com for more info";
        assert_eq!(
            replace_urls(input, "[URL]"),
            "Check out [URL] for more info"
        );
    }

    /// An e-mail address is expected to collapse to the placeholder.
    #[test]
    fn test_replace_emails() {
        let input = "Contact us at support@example.com for help";
        assert_eq!(
            replace_emails(input, "[EMAIL]"),
            "Contact us at [EMAIL] for help"
        );
    }

    /// `can't`, `it's` and `They'll` are expected to be spelled out.
    #[test]
    fn test_expand_contractions() {
        let input = "I can't believe it's working! They'll be happy.";
        let expected = "I cannot believe it is working! They will be happy.";
        assert_eq!(expand_contractions(input), expected);
    }

    /// Leading/trailing whitespace is expected to be trimmed and inner runs
    /// (spaces, newline, tab) collapsed to single spaces.
    #[test]
    fn test_normalize_whitespace() {
        let collapsed = normalize_whitespace("  Hello   world  \n\t  test  ");
        assert_eq!(collapsed, "Hello world test");
    }

    /// Accented letters are expected to map to their unaccented base letters.
    #[test]
    fn test_remove_accents() {
        let plain = remove_accents("Héllo wörld café");
        assert_eq!(plain, "Hello world cafe");
    }

    /// The default cleaner's output is expected to be lowercased, free of
    /// HTML tags, and to carry `[url]` / `[email]` placeholders.
    #[test]
    fn test_advanced_cleaner() {
        let input = "<p>Check out https://example.com! Email: test@example.com</p>";
        let default_cleaner = AdvancedTextCleaner::new();
        let output = default_cleaner.clean(input).unwrap();
        assert_eq!(output, "check out [url]! email: [email]");
    }

    /// The privacy-focused preset is expected to mask phone numbers and
    /// e-mail addresses.
    #[test]
    fn test_privacy_focused_cleaner() {
        let input = "Call me at (555) 123-4567 or email john@example.com";
        let privacy_cleaner = AdvancedTextCleaner::privacy_focused();
        let output = privacy_cleaner.clean(input).unwrap();
        assert_eq!(output, "call me at [phone] or email [email]");
    }

    /// Covers a thousands-separated decimal, a plain integer, and a value in
    /// scientific notation.
    #[test]
    fn test_normalize_numbers() {
        let plain_input = "The price is 1,234.56 and the quantity is 42";
        assert_eq!(
            normalize_numbers(plain_input, "[NUM]"),
            "The price is [NUM] and the quantity is [NUM]"
        );

        let scientific_input = "The value is 3.14e-10";
        assert_eq!(
            normalize_numbers(scientific_input, "[NUM]"),
            "The value is [NUM]"
        );
    }

    /// Covers currency symbols as well as spelled-out currency words.
    #[test]
    fn test_normalize_currencies() {
        let symbol_input = "The cost is $45.99 or €50.00";
        assert_eq!(
            normalize_currencies(symbol_input, "[MONEY]"),
            "The cost is [MONEY] or [MONEY]"
        );

        let word_input = "It costs 100 dollars or 85 euros";
        assert_eq!(
            normalize_currencies(word_input, "[MONEY]"),
            "It costs [MONEY] or [MONEY]"
        );
    }

    /// Covers an integer percentage and a negative decimal percentage.
    #[test]
    fn test_normalize_percentages() {
        let input = "The growth is 25% and the decline is -5.5%";
        assert_eq!(
            normalize_percentages(input, "[PCT]"),
            "The growth is [PCT] and the decline is [PCT]"
        );
    }

    /// Covers the `st`, `nd` and `rd` ordinal suffixes.
    #[test]
    fn test_normalize_ordinals() {
        let input = "He came 1st, she was 2nd, and they were 3rd";
        let expected = "He came [ORD], she was [ORD], and they were [ORD]";
        assert_eq!(normalize_ordinals(input, "[ORD]"), expected);
    }

    /// With four number-related options switched on, the cleaner is expected
    /// to emit its own lowercase placeholders for each category.
    #[test]
    fn test_number_normalization_in_cleaner() {
        let number_cleaner = AdvancedTextCleaner::new()
            .set_normalize_numbers(true)
            .set_normalize_currencies(true)
            .set_normalize_percentages(true)
            .set_normalize_ordinals(true);

        let input = "The 1st item costs $99.99 with a 15% discount, total: 84.99";
        let expected =
            "the [ordinal] item costs [currency] with a [percent] discount, total: [number]";
        assert_eq!(number_cleaner.clean(input).unwrap(), expected);
    }

    /// Covers slash-separated, month-first and day-first date spellings.
    #[test]
    fn test_normalize_dates() {
        let input = "Meeting on 12/25/2023, or December 25, 2023, or 25 December 2023";
        assert_eq!(
            normalize_dates(input, "[DATE]"),
            "Meeting on [DATE], or [DATE], or [DATE]"
        );
    }

    /// Covers 24-hour, 12-hour with `PM`, and `HH:MM:SS` times.
    #[test]
    fn test_normalize_times() {
        let input = "Meeting at 14:30 or 2:30 PM or 09:00:15";
        assert_eq!(
            normalize_times(input, "[TIME]"),
            "Meeting at [TIME] or [TIME] or [TIME]"
        );
    }

    /// Covers a simple fraction and a mixed number.
    #[test]
    fn test_normalize_fractions() {
        let input = "Mix 1/2 cup flour with 2 3/4 cups sugar";
        let expected = "Mix [FRACTION] cup flour with [FRACTION] cups sugar";
        assert_eq!(normalize_fractions(input, "[FRACTION]"), expected);
    }

    /// Roman numerals are expected to be replaced while ordinary capitalised
    /// words stay untouched.
    #[test]
    fn test_normalize_roman_numerals() {
        let input = "Chapter IV discusses Section XVII and Part III";
        let expected = "Chapter [ROMAN] discusses Section [ROMAN] and Part [ROMAN]";
        assert_eq!(normalize_roman_numerals(input, "[ROMAN]"), expected);
    }

    /// Covers lowercase `e` and uppercase `E` exponents, with and without sign.
    #[test]
    fn test_normalize_scientific_notation() {
        let input = "Value is 6.022e23 or 1.23E-10";
        assert_eq!(
            normalize_scientific_notation(input, "[SCI]"),
            "Value is [SCI] or [SCI]"
        );
    }

    /// Covers Celsius, Fahrenheit and a Kelvin value followed by unit words.
    #[test]
    fn test_normalize_temperatures() {
        let input = "Temperature is 25°C or 77°F or 298K degrees kelvin";
        assert_eq!(
            normalize_temperatures(input, "[TEMP]"),
            "Temperature is [TEMP] or [TEMP] or [TEMP]"
        );
    }

    /// Covers distance, weight, speed and storage units.
    #[test]
    fn test_normalize_measurements() {
        let input = "Distance: 5km, weight: 2.5kg, speed: 60mph, storage: 1TB";
        let expected =
            "Distance: [MEASURE], weight: [MEASURE], speed: [MEASURE], storage: [MEASURE]";
        assert_eq!(normalize_measurements(input, "[MEASURE]"), expected);
    }

    /// Covers a `v`-prefixed version and one with a pre-release suffix.
    #[test]
    fn test_normalize_versions() {
        let input = "Using Python v3.9.1 and Node.js 16.14.0-alpha1";
        assert_eq!(
            normalize_versions(input, "[VER]"),
            "Using Python [VER] and Node.js [VER]"
        );
    }

    /// Covers two dotted-quad addresses.
    #[test]
    fn test_normalize_ip_addresses() {
        let input = "Server at 192.168.1.1 and backup at 10.0.0.255";
        assert_eq!(
            normalize_ip_addresses(input, "[IP]"),
            "Server at [IP] and backup at [IP]"
        );
    }

    /// Covers a `#`-prefixed colour code and a `0x`-prefixed literal.
    #[test]
    fn test_normalize_hex_numbers() {
        let input = "Color #FF5733 or address 0x1A2B3C4D";
        assert_eq!(
            normalize_hex_numbers(input, "[HEX]"),
            "Color [HEX] or address [HEX]"
        );
    }

    /// Covers two `0b`-prefixed literals of different lengths.
    #[test]
    fn test_normalize_binary_numbers() {
        let input = "Binary values: 0b1010 and 0b11110000";
        assert_eq!(
            normalize_binary_numbers(input, "[BIN]"),
            "Binary values: [BIN] and [BIN]"
        );
    }

    /// Dollar, euro and yen symbols plus a trailing `USD` code are all
    /// expected to be masked by the cleaner.
    #[test]
    fn test_enhanced_currency_normalization() {
        let currency_cleaner = AdvancedTextCleaner::new().set_normalize_currencies(true);
        let input = "Cost is $100.50, €75.25, ¥10000, or 50 USD";
        let expected = "cost is [currency], [currency], [currency], or [currency]";
        assert_eq!(currency_cleaner.clean(input).unwrap(), expected);
    }

    /// Enables ten advanced options at once and checks that the placeholders
    /// for the categories present in the input show up in the output.
    #[test]
    fn test_comprehensive_advanced_normalization() {
        let full_cleaner = AdvancedTextCleaner::new()
            .set_normalize_dates(true)
            .set_normalize_times(true)
            .set_normalize_fractions(true)
            .set_normalize_scientific_notation(true)
            .set_normalize_temperatures(true)
            .set_normalize_measurements(true)
            .set_normalize_versions(true)
            .set_normalize_ip_addresses(true)
            .set_normalize_hex_numbers(true)
            .set_normalize_binary_numbers(true);

        let input = "On 12/25/2023 at 14:30, server 192.168.1.1 v2.1.0 measured 25°C, processed 0xFF data with 1/2 efficiency, used 6.022e23 molecules";
        let output = full_cleaner.clean(input).unwrap();

        // Should normalize all the different number/data types
        let expected_tags = [
            "[date]",
            "[time]",
            "[ip]",
            "[version]",
            "[temp]",
            "[hex]",
            "[fraction]",
            "[scientific]",
        ];
        for tag in expected_tags.iter() {
            assert!(output.contains(tag), "missing placeholder {}", tag);
        }
    }

    /// The privacy-focused preset is expected to mask date, time, e-mail,
    /// phone number and IP address in one pass.
    #[test]
    fn test_privacy_focused_cleaner_with_advanced_features() {
        let privacy_cleaner = AdvancedTextCleaner::privacy_focused();
        let input = "Meeting on 01/15/2024 at 14:30, contact john@example.com or call (555) 123-4567, server: 192.168.1.100";
        let output = privacy_cleaner.clean(input).unwrap();

        // Privacy-focused should normalize sensitive information
        let expected =
            "meeting on [date] at [time], contact [email] or call [phone], server: [ip]";
        assert_eq!(output, expected);
    }

    /// One sample of each supported number format is expected to end up as
    /// the same shared placeholder.
    #[test]
    fn test_normalize_all_numbers_function() {
        let input = "Value: 3.14e-10, temp: 25°C, price: $99.99, percent: 15%, ordinal: 1st, fraction: 1/2, hex: 0xFF, binary: 0b1010, IP: 192.168.1.1, version: v1.2.3, roman: IV";
        let output = normalize_all_numbers(input, "[NUM]");

        // Should normalize all number types with the same placeholder
        let expected = "Value: [NUM], temp: [NUM], price: [NUM], percent: [NUM], ordinal: [NUM], fraction: [NUM], hex: [NUM], binary: [NUM], IP: [NUM], version: [NUM], roman: [NUM]";
        assert_eq!(output, expected);
    }

    /// Overrides three of the eleven advanced placeholders and leaves the
    /// rest as `None`. The expected output shows the custom placeholders
    /// lowercased along with the rest of the text.
    #[test]
    fn test_advanced_placeholder_customization() {
        // Presumably the 1st, 6th and 10th slots are the date, temperature
        // and hex placeholders (inferred from the expected output) - confirm
        // against the `set_advanced_placeholders` signature.
        let date_placeholder = Some("[CUSTOM_DATE]".to_string());
        let temperature_placeholder = Some("[CUSTOM_TEMP]".to_string());
        let hex_placeholder = Some("[CUSTOM_HEX]".to_string());

        let custom_cleaner = AdvancedTextCleaner::new()
            .set_normalize_dates(true)
            .set_normalize_temperatures(true)
            .set_normalize_hex_numbers(true)
            .set_advanced_placeholders(
                date_placeholder,
                None,
                None,
                None,
                None,
                temperature_placeholder,
                None,
                None,
                None,
                hex_placeholder,
                None,
            );

        let input = "Date: 12/25/2023, temp: 25°C, color: #FF0000";
        let expected = "date: [custom_date], temp: [custom_temp], color: [custom_hex]";
        assert_eq!(custom_cleaner.clean(input).unwrap(), expected);
    }
}