neutts 0.1.1 - Docs.rs

//! Text preprocessing pipeline.
//!
//! Converts raw input text (numbers, symbols, abbreviations, etc.)
//! into a clean spoken-word form before phonemisation.
//! Mirrors the preprocessing step in the Python NeuTTS pipeline.

use fancy_regex::{Captures, Regex};
use once_cell::sync::Lazy;
use std::borrow::Cow;

// ─────────────────────────────────────────────────────────────────────────────
// Number → words
// ─────────────────────────────────────────────────────────────────────────────

const ONES: &[&str] = &[
    "", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen",
];
const TENS: &[&str] = &["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"];
const SCALE: &[&str] = &["", "thousand", "million", "billion", "trillion"];

fn three_digits_to_words(n: u64) -> String {
    if n == 0 {
        return String::new();
    }
    let mut parts = Vec::new();
    let hundreds = n / 100;
    let remainder = n % 100;
    if hundreds > 0 {
        parts.push(format!("{} hundred", ONES[hundreds as usize]));
    }
    if remainder < 20 {
        if remainder > 0 {
            parts.push(ONES[remainder as usize].to_string());
        }
    } else {
        let tens_word = TENS[(remainder / 10) as usize];
        let ones_word = ONES[(remainder % 10) as usize];
        if ones_word.is_empty() {
            parts.push(tens_word.to_string());
        } else {
            parts.push(format!("{}-{}", tens_word, ones_word));
        }
    }
    parts.join(" ")
}

/// Convert a non-negative integer to English words.
pub fn number_to_words(n: i64) -> String {
    if n < 0 {
        return format!("negative {}", number_to_words(-n));
    }
    let n = n as u64;
    if n == 0 {
        return "zero".to_string();
    }
    // X00-X999 (not multiples of 1000): read as "X hundred"
    if n >= 100 && n <= 9999 && n % 100 == 0 && n % 1000 != 0 {
        let hundreds = n / 100;
        if hundreds < 20 {
            return format!("{} hundred", ONES[hundreds as usize]);
        }
    }
    let mut parts = Vec::new();
    let mut remaining = n;
    for (i, &scale) in SCALE.iter().enumerate() {
        let chunk = remaining % 1000;
        if chunk > 0 {
            let chunk_words = three_digits_to_words(chunk);
            if scale.is_empty() {
                parts.push(chunk_words);
            } else {
                parts.push(format!("{} {}", chunk_words, scale));
            }
        }
        remaining /= 1000;
        if remaining == 0 {
            let _ = i;
            break;
        }
    }
    parts.reverse();
    parts.join(" ")
}

/// Convert a float string to words, reading decimal digits individually.
pub fn float_to_words(value: &str) -> String {
    let negative = value.starts_with('-');
    let value = if negative { &value[1..] } else { value };

    let digit_words = |c: char| match c {
        '0' => "zero",
        '1' => "one",
        '2' => "two",
        '3' => "three",
        '4' => "four",
        '5' => "five",
        '6' => "six",
        '7' => "seven",
        '8' => "eight",
        '9' => "nine",
        _ => "",
    };

    let result = if let Some(dot) = value.find('.') {
        let int_part = &value[..dot];
        let dec_part = &value[dot + 1..];
        let int_words = if int_part.is_empty() {
            "zero".to_string()
        } else {
            number_to_words(int_part.parse::<i64>().unwrap_or(0))
        };
        let dec_words: Vec<&str> = dec_part.chars().map(digit_words).collect();
        format!("{} point {}", int_words, dec_words.join(" "))
    } else {
        number_to_words(value.parse::<i64>().unwrap_or(0))
    };

    if negative {
        format!("negative {}", result)
    } else {
        result
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Ordinals
// ─────────────────────────────────────────────────────────────────────────────

fn ordinal_suffix(n: i64) -> String {
    let word = number_to_words(n);
    let exceptions = [
        ("one", "first"),
        ("two", "second"),
        ("three", "third"),
        ("four", "fourth"),
        ("five", "fifth"),
        ("six", "sixth"),
        ("seven", "seventh"),
        ("eight", "eighth"),
        ("nine", "ninth"),
        ("twelve", "twelfth"),
    ];

    // Split on the last hyphen or space to get the final word
    let (prefix, last, sep) = if let Some(pos) = word.rfind('-') {
        (&word[..pos], &word[pos + 1..], "-")
    } else if let Some(pos) = word.rfind(' ') {
        (&word[..pos], &word[pos + 1..], " ")
    } else {
        ("", word.as_str(), "")
    };

    let last_ord = exceptions
        .iter()
        .find(|(base, _)| *base == last)
        .map(|(_, ord)| (*ord).to_string())
        .unwrap_or_else(|| {
            if last.ends_with('t') {
                // e.g. "eight" → "eighth"
                format!("{}h", last)
            } else if last.ends_with('e') {
                // e.g. "nine" → "ninth" (drop 'e', add "th")
                format!("{}th", &last[..last.len() - 1])
            } else if last.ends_with('y') {
                // e.g. "twenty" → "twentieth", "thirty" → "thirtieth"
                format!("{}ieth", &last[..last.len() - 1])
            } else {
                format!("{}th", last)
            }
        });

    if prefix.is_empty() {
        last_ord
    } else {
        format!("{}{}{}", prefix, sep, last_ord)
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Compiled regexes (lazily initialised once)
// ─────────────────────────────────────────────────────────────────────────────

static RE_ORDINAL: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\b(\d+)(st|nd|rd|th)\b").unwrap());
static RE_PERCENT: Lazy<Regex> = Lazy::new(|| Regex::new(r"(-?[\d,]+(?:\.\d+)?)\s*%").unwrap());
static RE_CURRENCY: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"([$€£¥₹₩₿])\s*([\d,]+(?:\.\d+)?)\s*([KMBT])?(?![a-zA-Z\d])").unwrap());
static RE_TIME: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)\b(\d{1,2}):(\d{2})(?::\d{2})?\s*(am|pm)?\b").unwrap());
static RE_RANGE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?<!\w)(\d+)-(\d+)(?!\w)").unwrap());
static RE_MODEL_VER: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)").unwrap());
static RE_UNIT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\b").unwrap()
});
static RE_SCALE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])").unwrap());
static RE_SCI: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])").unwrap());
static RE_FRACTION: Lazy<Regex> = Lazy::new(|| Regex::new(r"\b(\d+)\s*/\s*(\d+)\b").unwrap());
static RE_DECADE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\b(\d{1,3})0s\b").unwrap());
static RE_LEAD_DEC: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?<!\d)\.(\d)").unwrap());
static RE_NEG_LEAD_DEC: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?<!\d)(-)\.(\d)").unwrap());
static RE_NUMBER: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?<![a-zA-Z])-?[\d,]+(?:\.\d+)?").unwrap());
static RE_IP: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b").unwrap());
static RE_PHONE_11: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?<!\d-)\b(\d{1,2})-(\d{3})-(\d{3})-(\d{4})\b(?!-\d)").unwrap());
static RE_PHONE_10: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?<!\d-)\b(\d{3})-(\d{3})-(\d{4})\b(?!-\d)").unwrap());
static RE_PHONE_7: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?<!\d-)\b(\d{3})-(\d{4})\b(?!-\d)").unwrap());
static RE_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r"https?://\S+|www\.\S+").unwrap());
static RE_EMAIL: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)\b[\w.+-]+@[\w-]+\.[a-z]{2,}\b").unwrap());
static RE_HTML: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
static RE_SPACES: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
static RE_PUNCT: Lazy<Regex> = Lazy::new(|| Regex::new(r"[^\w\s]").unwrap());
static RE_CONTRACTION_CANT: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\bcan't\b").unwrap());
static RE_CONTRACTION_WONT: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\bwon't\b").unwrap());
static RE_CONTRACTION_SHANT: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\bshan't\b").unwrap());
static RE_CONTRACTION_AINT: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\bain't\b").unwrap());
static RE_CONTRACTION_LETS: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\blet's\b").unwrap());
static RE_CONTRACTION_ITS: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\bit's\b").unwrap());
static RE_CONTRACTION_NT: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\b(\w+)n't\b").unwrap());
static RE_CONTRACTION_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\b(\w+)'re\b").unwrap());
static RE_CONTRACTION_VE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\b(\w+)'ve\b").unwrap());
static RE_CONTRACTION_LL: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\b(\w+)'ll\b").unwrap());
static RE_CONTRACTION_D: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\b(\w+)'d\b").unwrap());
static RE_CONTRACTION_M: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\b(\w+)'m\b").unwrap());

// ─────────────────────────────────────────────────────────────────────────────
// Expansion functions
// ─────────────────────────────────────────────────────────────────────────────

fn currency_symbol_name(sym: &str) -> &'static str {
    match sym {
        "$" => "dollar",
        "€" => "euro",
        "£" => "pound",
        "¥" => "yen",
        "₹" => "rupee",
        "₩" => "won",
        "₿" => "bitcoin",
        _ => "",
    }
}

fn scale_suffix_word(s: &str) -> &'static str {
    match s {
        "K" => "thousand",
        "M" => "million",
        "B" => "billion",
        "T" => "trillion",
        _ => "",
    }
}

fn digits_to_words(s: &str) -> String {
    s.chars()
        .map(|c| match c {
            '0' => "zero",
            '1' => "one",
            '2' => "two",
            '3' => "three",
            '4' => "four",
            '5' => "five",
            '6' => "six",
            '7' => "seven",
            '8' => "eight",
            '9' => "nine",
            _ => "",
        })
        .filter(|s| !s.is_empty())
        .collect::<Vec<_>>()
        .join(" ")
}

fn unit_expansion(unit: &str) -> &'static str {
    match unit.to_lowercase().as_str() {
        "km"       => "kilometers",
        "kg"       => "kilograms",
        "mg"       => "milligrams",
        "ml"       => "milliliters",
        "gb"       => "gigabytes",
        "mb"       => "megabytes",
        "kb"       => "kilobytes",
        "tb"       => "terabytes",
        "hz"       => "hertz",
        "khz"      => "kilohertz",
        "mhz"      => "megahertz",
        "ghz"      => "gigahertz",
        "mph"      => "miles per hour",
        "kph"      => "kilometers per hour",
        "ms"       => "milliseconds",
        "ns"       => "nanoseconds",
        "µs"       => "microseconds",
        "°c" | "c°" => "degrees Celsius",
        "°f" | "f°" => "degrees Fahrenheit",
        _          => "",        // caller falls back to the raw unit string
    }
}

pub fn expand_ordinals(text: &str) -> String {
    RE_ORDINAL
        .replace_all(text, |caps: &Captures| {
            let n: i64 = caps[1].parse().unwrap_or(0);
            ordinal_suffix(n)
        })
        .into_owned()
}

pub fn expand_percentages(text: &str) -> String {
    RE_PERCENT
        .replace_all(text, |caps: &Captures| {
            let raw = caps[1].replace(',', "");
            let words = if raw.contains('.') {
                float_to_words(&raw)
            } else {
                number_to_words(raw.parse::<i64>().unwrap_or(0))
            };
            format!("{} percent", words)
        })
        .into_owned()
}

pub fn expand_currency(text: &str) -> String {
    RE_CURRENCY
        .replace_all(text, |caps: &Captures| {
            let symbol = &caps[1];
            let raw = caps[2].replace(',', "");
            let scale_suffix = caps.get(3).map(|m| m.as_str()).unwrap_or("");
            let unit = currency_symbol_name(symbol);

            if !scale_suffix.is_empty() {
                let scale_word = scale_suffix_word(scale_suffix);
                let num = if raw.contains('.') {
                    float_to_words(&raw)
                } else {
                    number_to_words(raw.parse::<i64>().unwrap_or(0))
                };
                return format!("{} {} {}s", num, scale_word, unit);
            }

            if raw.contains('.') {
                let dot = raw.find('.').unwrap();
                let int_part: i64 = raw[..dot].parse().unwrap_or(0);
                let dec_str = &raw[dot + 1..];
                let dec_val: i64 = dec_str[..dec_str.len().min(2)]
                    .chars()
                    .chain(std::iter::repeat('0'))
                    .take(2)
                    .collect::<String>()
                    .parse()
                    .unwrap_or(0);
                let int_words = number_to_words(int_part);
                let mut result = if unit.is_empty() {
                    int_words
                } else {
                    format!("{} {}s", int_words, unit)
                };
                if dec_val > 0 {
                    let cents = number_to_words(dec_val);
                    let plural = if dec_val == 1 { "" } else { "s" };
                    result.push_str(&format!(" and {} cent{}", cents, plural));
                }
                result
            } else {
                let val: i64 = raw.parse().unwrap_or(0);
                let words = number_to_words(val);
                if unit.is_empty() {
                    words
                } else {
                    let plural = if val == 1 { "" } else { "s" };
                    format!("{} {}{}", words, unit, plural)
                }
            }
        })
        .into_owned()
}

pub fn expand_time(text: &str) -> String {
    RE_TIME
        .replace_all(text, |caps: &Captures| {
            let h: i64 = caps[1].parse().unwrap_or(0);
            let mins: i64 = caps[2].parse().unwrap_or(0);
            let suffix = caps
                .get(3)
                .map(|m| format!(" {}", m.as_str().to_lowercase()))
                .unwrap_or_default();
            let h_words = number_to_words(h);
            if mins == 0 {
                if caps.get(3).is_some() {
                    format!("{}{}", h_words, suffix)
                } else {
                    format!("{} hundred", h_words)
                }
            } else if mins < 10 {
                format!("{} oh {}{}", h_words, number_to_words(mins), suffix)
            } else {
                format!("{} {}{}", h_words, number_to_words(mins), suffix)
            }
        })
        .into_owned()
}

pub fn expand_ranges(text: &str) -> String {
    RE_RANGE
        .replace_all(text, |caps: &Captures| {
            let lo = number_to_words(caps[1].parse().unwrap_or(0));
            let hi = number_to_words(caps[2].parse().unwrap_or(0));
            format!("{} to {}", lo, hi)
        })
        .into_owned()
}

pub fn expand_model_names(text: &str) -> String {
    RE_MODEL_VER
        .replace_all(text, |caps: &Captures| {
            format!("{} {}", &caps[1], &caps[2])
        })
        .into_owned()
}

pub fn expand_units(text: &str) -> String {
    RE_UNIT
        .replace_all(text, |caps: &Captures| {
            let raw = &caps[1];
            let unit = &caps[2];
            let expanded = unit_expansion(unit);
            let expanded = if expanded.is_empty() { unit.as_ref() } else { expanded };
            let num = if raw.contains('.') {
                float_to_words(raw)
            } else {
                number_to_words(raw.parse::<i64>().unwrap_or(0))
            };
            format!("{} {}", num, expanded)
        })
        .into_owned()
}

pub fn expand_scale_suffixes(text: &str) -> String {
    RE_SCALE
        .replace_all(text, |caps: &Captures| {
            let raw = &caps[1];
            let suffix = &caps[2];
            let scale_word = scale_suffix_word(suffix);
            let num = if raw.contains('.') {
                float_to_words(raw)
            } else {
                number_to_words(raw.parse::<i64>().unwrap_or(0))
            };
            format!("{} {}", num, scale_word)
        })
        .into_owned()
}

pub fn expand_scientific_notation(text: &str) -> String {
    RE_SCI
        .replace_all(text, |caps: &Captures| {
            let coeff = &caps[1];
            let exp: i64 = caps[2].parse().unwrap_or(0);
            let coeff_words = if coeff.contains('.') {
                float_to_words(coeff)
            } else {
                number_to_words(coeff.parse::<i64>().unwrap_or(0))
            };
            let exp_words = number_to_words(exp.abs());
            let sign = if exp < 0 { "negative " } else { "" };
            format!("{} times ten to the {}{}", coeff_words, sign, exp_words)
        })
        .into_owned()
}

pub fn expand_fractions(text: &str) -> String {
    RE_FRACTION
        .replace_all(text, |caps: &Captures| {
            let num: i64 = caps[1].parse().unwrap_or(0);
            let den: i64 = caps[2].parse().unwrap_or(1);
            if den == 0 {
                return caps[0].to_string();
            }
            let num_words = number_to_words(num);
            let denom_word = match den {
                2 => (if num == 1 { "half" } else { "halves" }).to_string(),
                4 => (if num == 1 { "quarter" } else { "quarters" }).to_string(),
                _ => {
                    let ord = ordinal_suffix(den);
                    if num != 1 {
                        format!("{}s", ord)
                    } else {
                        ord
                    }
                }
            };
            format!("{} {}", num_words, denom_word)
        })
        .into_owned()
}

pub fn expand_decades(text: &str) -> String {
    let decade_map = [
        (0, "hundreds"),
        (1, "tens"),
        (2, "twenties"),
        (3, "thirties"),
        (4, "forties"),
        (5, "fifties"),
        (6, "sixties"),
        (7, "seventies"),
        (8, "eighties"),
        (9, "nineties"),
    ];
    RE_DECADE
        .replace_all(text, |caps: &Captures| {
            let base: i64 = caps[1].parse().unwrap_or(0);
            let decade_digit = (base % 10) as usize;
            let decade_word = decade_map
                .iter()
                .find(|(d, _)| *d == decade_digit as i64)
                .map(|(_, w)| *w)
                .unwrap_or("");
            if base < 10 {
                decade_word.to_string()
            } else {
                let century_part = base / 10;
                format!("{} {}", number_to_words(century_part), decade_word)
            }
        })
        .into_owned()
}

pub fn expand_ip_addresses(text: &str) -> String {
    RE_IP
        .replace_all(text, |caps: &Captures| {
            let octets: Vec<String> = (1..=4)
                .map(|i| digits_to_words(&caps[i]))
                .collect();
            octets.join(" dot ")
        })
        .into_owned()
}

pub fn expand_phone_numbers(text: &str) -> String {
    let join_groups = |groups: Vec<&str>| -> String {
        groups.iter().map(|g| digits_to_words(g)).collect::<Vec<_>>().join(" ")
    };
    // 11-digit first
    let text = RE_PHONE_11
        .replace_all(text, |caps: &Captures| {
            join_groups(vec![&caps[1], &caps[2], &caps[3], &caps[4]])
        })
        .into_owned();
    // 10-digit
    let text = RE_PHONE_10
        .replace_all(&text, |caps: &Captures| {
            join_groups(vec![&caps[1], &caps[2], &caps[3]])
        })
        .into_owned();
    // 7-digit
    RE_PHONE_7
        .replace_all(&text, |caps: &Captures| {
            join_groups(vec![&caps[1], &caps[2]])
        })
        .into_owned()
}

pub fn normalize_leading_decimals(text: &str) -> String {
    // Handle -.5 → -0.5
    let text = RE_NEG_LEAD_DEC
        .replace_all(text, |caps: &Captures| {
            format!("{}0.{}", &caps[1], &caps[2])
        })
        .into_owned();
    // Handle .5 → 0.5
    RE_LEAD_DEC
        .replace_all(&text, |caps: &Captures| format!("0.{}", &caps[1]))
        .into_owned()
}

pub fn replace_numbers(text: &str) -> String {
    RE_NUMBER
        .replace_all(text, |caps: &Captures| {
            let raw = caps[0].replace(',', "");
            if raw.contains('.') {
                float_to_words(&raw)
            } else if let Ok(n) = raw.parse::<i64>() {
                number_to_words(n)
            } else {
                caps[0].to_string()
            }
        })
        .into_owned()
}

pub fn expand_contractions(text: &str) -> String {
    let text = RE_CONTRACTION_CANT.replace_all(text, "cannot").into_owned();
    let text = RE_CONTRACTION_WONT.replace_all(&text, "will not").into_owned();
    let text = RE_CONTRACTION_SHANT.replace_all(&text, "shall not").into_owned();
    let text = RE_CONTRACTION_AINT.replace_all(&text, "is not").into_owned();
    let text = RE_CONTRACTION_LETS.replace_all(&text, "let us").into_owned();
    let text = RE_CONTRACTION_ITS.replace_all(&text, "it is").into_owned();
    let text = RE_CONTRACTION_NT.replace_all(&text, "$1 not").into_owned();
    let text = RE_CONTRACTION_RE.replace_all(&text, "$1 are").into_owned();
    let text = RE_CONTRACTION_VE.replace_all(&text, "$1 have").into_owned();
    let text = RE_CONTRACTION_LL.replace_all(&text, "$1 will").into_owned();
    let text = RE_CONTRACTION_D.replace_all(&text, "$1 would").into_owned();
    RE_CONTRACTION_M.replace_all(&text, "$1 am").into_owned()
}

pub fn remove_urls(text: &str) -> Cow<'_, str> {
    RE_URL.replace_all(text, "")
}

pub fn remove_emails(text: &str) -> Cow<'_, str> {
    RE_EMAIL.replace_all(text, "")
}

pub fn remove_html_tags(text: &str) -> Cow<'_, str> {
    RE_HTML.replace_all(text, " ")
}

pub fn remove_punctuation(text: &str) -> Cow<'_, str> {
    RE_PUNCT.replace_all(text, " ")
}

pub fn remove_extra_whitespace(text: &str) -> String {
    RE_SPACES.replace_all(text.trim(), " ").into_owned()
}

// ─────────────────────────────────────────────────────────────────────────────
// TextPreprocessor — full pipeline
// ─────────────────────────────────────────────────────────────────────────────

/// Configuration for the text preprocessing pipeline.
#[derive(Debug, Clone)]
pub struct PreprocessorConfig {
    pub lowercase: bool,
    pub replace_numbers: bool,
    pub expand_contractions: bool,
    pub expand_model_names: bool,
    pub expand_ordinals: bool,
    pub expand_percentages: bool,
    pub expand_currency: bool,
    pub expand_time: bool,
    pub expand_ranges: bool,
    pub expand_units: bool,
    pub expand_scale_suffixes: bool,
    pub expand_scientific_notation: bool,
    pub expand_fractions: bool,
    pub expand_decades: bool,
    pub expand_phone_numbers: bool,
    pub expand_ip_addresses: bool,
    pub normalize_leading_decimals: bool,
    pub remove_urls: bool,
    pub remove_emails: bool,
    pub remove_html: bool,
    pub remove_punctuation: bool,
    pub remove_extra_whitespace: bool,
}

impl Default for PreprocessorConfig {
    fn default() -> Self {
        Self {
            lowercase: true,
            replace_numbers: true,
            expand_contractions: true,
            expand_model_names: true,
            expand_ordinals: true,
            expand_percentages: true,
            expand_currency: true,
            expand_time: true,
            expand_ranges: true,
            expand_units: true,
            expand_scale_suffixes: true,
            expand_scientific_notation: true,
            expand_fractions: true,
            expand_decades: true,
            expand_phone_numbers: true,
            expand_ip_addresses: true,
            normalize_leading_decimals: true,
            remove_urls: true,
            remove_emails: true,
            remove_html: true,
            remove_punctuation: true,
            remove_extra_whitespace: true,
        }
    }
}

/// Full text preprocessing pipeline — mirrors Python's `TextPreprocessor`.
pub struct TextPreprocessor {
    pub config: PreprocessorConfig,
}

impl Default for TextPreprocessor {
    fn default() -> Self {
        Self { config: PreprocessorConfig::default() }
    }
}

impl TextPreprocessor {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_config(config: PreprocessorConfig) -> Self {
        Self { config }
    }

    pub fn process(&self, text: &str) -> String {
        let cfg = &self.config;
        let mut text = text.to_string();

        if cfg.remove_html {
            text = remove_html_tags(&text).into_owned();
        }
        if cfg.remove_urls {
            text = remove_urls(&text).into_owned();
        }
        if cfg.remove_emails {
            text = remove_emails(&text).into_owned();
        }
        if cfg.expand_contractions {
            text = expand_contractions(&text);
        }
        if cfg.expand_ip_addresses {
            text = expand_ip_addresses(&text);
        }
        if cfg.normalize_leading_decimals {
            text = normalize_leading_decimals(&text);
        }
        if cfg.expand_currency {
            text = expand_currency(&text);
        }
        if cfg.expand_percentages {
            text = expand_percentages(&text);
        }
        if cfg.expand_scientific_notation {
            text = expand_scientific_notation(&text);
        }
        if cfg.expand_time {
            text = expand_time(&text);
        }
        if cfg.expand_ordinals {
            text = expand_ordinals(&text);
        }
        if cfg.expand_units {
            text = expand_units(&text);
        }
        if cfg.expand_scale_suffixes {
            text = expand_scale_suffixes(&text);
        }
        if cfg.expand_fractions {
            text = expand_fractions(&text);
        }
        if cfg.expand_decades {
            text = expand_decades(&text);
        }
        if cfg.expand_phone_numbers {
            text = expand_phone_numbers(&text);
        }
        if cfg.expand_ranges {
            text = expand_ranges(&text);
        }
        if cfg.expand_model_names {
            text = expand_model_names(&text);
        }
        if cfg.replace_numbers {
            text = replace_numbers(&text);
        }
        if cfg.remove_punctuation {
            text = remove_punctuation(&text).into_owned();
        }
        if cfg.lowercase {
            text = text.to_lowercase();
        }
        if cfg.remove_extra_whitespace {
            text = remove_extra_whitespace(&text);
        }

        text
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_number_to_words() {
        assert_eq!(number_to_words(0), "zero");
        assert_eq!(number_to_words(1), "one");
        assert_eq!(number_to_words(12), "twelve");
        assert_eq!(number_to_words(1200), "twelve hundred");
        assert_eq!(number_to_words(1000), "one thousand");
        assert_eq!(number_to_words(-42), "negative forty-two");
        assert_eq!(number_to_words(1_000_000), "one million");
    }

    #[test]
    fn test_float_to_words() {
        assert_eq!(float_to_words("3.14"), "three point one four");
        assert_eq!(float_to_words("-0.5"), "negative zero point five");
        assert_eq!(float_to_words("1.50"), "one point five zero");
    }

    #[test]
    fn test_ordinals() {
        let pp = TextPreprocessor::new();
        let result = pp.process("She finished 1st, he came 2nd, I was 3rd.");
        assert!(result.contains("first"), "got: {}", result);
        assert!(result.contains("second"), "got: {}", result);
        assert!(result.contains("third"), "got: {}", result);
    }

    #[test]
    fn test_percentages() {
        assert_eq!(
            TextPreprocessor::new().process("50% off"),
            "fifty percent off"
        );
    }

    #[test]
    fn test_currency() {
        let out = TextPreprocessor::new().process("$4.99");
        // "four dollars and ninety-nine cents"; hyphens removed by remove_punctuation
        assert!(out.contains("four dollar"), "got: {}", out);
        assert!(out.contains("ninety nine cent"), "got: {}", out);
    }

    #[test]
    fn test_contractions() {
        let out = TextPreprocessor::new().process("I don't know");
        assert!(out.contains("do not"), "got: {}", out);
    }

    #[test]
    fn test_scale_suffixes() {
        let out = TextPreprocessor::new().process("a 7B parameter model");
        assert!(out.contains("seven billion"), "got: {}", out);
    }

    #[test]
    fn test_scientific_notation() {
        let out = TextPreprocessor::new().process("lr 1e-4");
        assert!(out.contains("times ten to the"), "got: {}", out);
    }

    #[test]
    fn test_full_pipeline() {
        let pp = TextPreprocessor::new();
        let out = pp.process("GPT-4 scored 90% in 3.5 seconds at 1e-4 lr.");
        // Should be all lowercase, no punctuation, numbers expanded
        assert!(out.chars().all(|c| c.is_lowercase() || c == ' '), "got: {}", out);
    }
}