// rehuman 0.1.3 - Docs.rs

//! rehuman — Unicode‑safe text cleaning & typographic normalization.

use deunicode::deunicode_char;
use icu_properties::{props, CodePointSetData, CodePointSetDataBorrowed};
use serde::Serialize;
use std::borrow::Cow;
use std::fmt;
#[cfg(feature = "unorm")]
use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;

mod generated;
mod sets;
use generated::{DASH_MAP, QUOTE_MAP, SPACE_MAP};
pub use sets::{is_emoji, is_extended_keyboard_char, is_hidden_char, is_keyboard_ascii};

/// U+2044 FRACTION SLASH; replaced by `/` when `normalize_other` is enabled.
const FRACTION_SLASH: char = '\u{2044}';
/// U+2026 HORIZONTAL ELLIPSIS; replaced by `...` when `normalize_other` is enabled.
const HORIZONTAL_ELLIPSIS: char = '\u{2026}';
/// U+22EF MIDLINE HORIZONTAL ELLIPSIS; handled the same way as U+2026.
const MIDLINE_HORIZONTAL_ELLIPSIS: char = '\u{22EF}';

/// Unicode normalization modes.
///
/// Stored in `CleaningOptions::unicode_normalization`. Requesting a mode
/// without the `unorm` feature is reported through
/// `CleaningError::NormalizationUnavailable`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnicodeNormalizationMode {
    /// No normalization form is requested.
    None,
    /// Unicode Normalization Form D (canonical decomposition).
    NFD,
    /// Unicode Normalization Form C (canonical decomposition, then composition).
    NFC,
    /// Unicode Normalization Form KD (compatibility decomposition).
    NFKD,
    /// Unicode Normalization Form KC (compatibility decomposition, then composition).
    NFKC,
}

/// Line ending styles.
///
/// Used as the optional target style in `CleaningOptions::normalize_line_endings`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineEndingStyle {
    /// Line feed (`\n`).
    Lf,
    /// Carriage return followed by line feed (`\r\n`).
    Crlf,
    /// Carriage return (`\r`).
    Cr,
}

/// Policy for emoji handling when `keyboard_only` is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmojiPolicy {
    /// Emoji clusters are written to the output instead of being rewritten
    /// or dropped.
    Keep,
    /// Emoji clusters that cannot be rewritten to keyboard characters are
    /// removed and counted in `CleaningStats::emojis_dropped`.
    Drop,
}

/// Policy for handling non-ASCII graphemes in `keyboard_only` mode.
///
/// The cleaner forwards this value to its keyboard rewrite step for every
/// cluster that reaches the `keyboard_only` branch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NonAsciiPolicy {
    /// Remove non-ASCII graphemes when they are not explicitly normalized elsewhere.
    Drop,
    /// Keep only compatibility-decomposed ASCII output.
    Fold,
    /// Fold first, then apply transliteration fallbacks before dropping.
    Transliterate,
}

/// Detailed statistics about cleaning operations.
///
/// Every counter is a `u64` that is only ever increased with saturating
/// addition. When the `stats` feature is disabled the recording macro is a
/// no-op, so the counters keep their default value of zero.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize)]
pub struct CleaningStats {
    /// Default-ignorable code points removed because `remove_hidden` is set.
    pub hidden_chars_removed: u64,
    /// Pending space/tab characters discarded in front of a newline.
    pub trailing_whitespace_removed: u64,
    /// Characters replaced through the space map.
    pub spaces_normalized: u64,
    /// Characters changed by dash normalization.
    pub dashes_normalized: u64,
    /// Characters changed by quote normalization.
    pub quotes_normalized: u64,
    /// Miscellaneous replacements (fraction slash, ellipsis characters).
    pub other_normalized: u64,
    /// Disallowed control characters removed.
    pub control_chars_removed: u64,
    /// Line endings converted. NOTE(review): the increment is not visible in
    /// this part of the file; presumably driven by `normalize_line_endings`.
    pub line_endings_normalized: u64,
    /// Non-ASCII characters removed by the keyboard rewrite step.
    pub non_keyboard_removed: u64,
    /// Non-ASCII characters transliterated by the keyboard rewrite step.
    pub non_keyboard_transliterated: u64,
    /// Emoji clusters dropped in `keyboard_only` mode.
    pub emojis_dropped: u64,
    /// Bidirectional control characters removed; only present with the
    /// `security` feature.
    #[cfg(feature = "security")]
    pub bidi_controls_removed: u64,
}

/// Result of a text cleaning operation.
///
/// The lifetime `'a` ties a borrowed `text` either to the original input
/// (when it is returned untouched) or to the caller-supplied output buffer of
/// the `*_into` APIs.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct CleaningResult<'a> {
    /// Cleaned text; borrowed when no new allocation was needed, owned otherwise.
    pub text: Cow<'a, str>,
    /// Total number of recorded changes. This is tracked independently of
    /// the `stats` feature.
    pub changes_made: u64,
    /// Per-category breakdown of the changes.
    pub stats: CleaningStats,
}

/// Errors produced by fallible cleaning APIs.
///
/// The type is `Copy`, implements `Display`, and implements
/// `std::error::Error`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CleaningError {
    /// A Unicode normalization mode was requested without the `unorm` feature.
    /// `requested` carries the mode that could not be applied.
    NormalizationUnavailable { requested: UnicodeNormalizationMode },
}

impl fmt::Display for CleaningError {
    /// Write a human-readable description of the error to `f`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The enum has a single variant, so the destructuring is irrefutable.
        // Adding a variant later turns this into a compile error, just like a
        // non-exhaustive `match` would.
        let CleaningError::NormalizationUnavailable { requested } = self;
        write!(
            f,
            "Unicode normalization {:?} requested but the 'unorm' feature is disabled",
            requested
        )
    }
}

// Empty impl: the trait's default methods are used, so no underlying source
// error is reported.
impl std::error::Error for CleaningError {}

impl CleaningStats {
    /// Fold the counters of `other` into `self`.
    ///
    /// Each counter is combined with saturating addition, so totals stay at
    /// `u64::MAX` instead of wrapping.
    ///
    /// # Arguments
    /// - `other`: Snapshot whose counters are added to `self`.
    #[cfg(feature = "stats")]
    pub fn accumulate(&mut self, other: &CleaningStats) {
        // Pair every always-present counter with the amount to add to it.
        let counters: [(&mut u64, u64); 11] = [
            (&mut self.hidden_chars_removed, other.hidden_chars_removed),
            (
                &mut self.trailing_whitespace_removed,
                other.trailing_whitespace_removed,
            ),
            (&mut self.spaces_normalized, other.spaces_normalized),
            (&mut self.dashes_normalized, other.dashes_normalized),
            (&mut self.quotes_normalized, other.quotes_normalized),
            (&mut self.other_normalized, other.other_normalized),
            (&mut self.control_chars_removed, other.control_chars_removed),
            (
                &mut self.line_endings_normalized,
                other.line_endings_normalized,
            ),
            (&mut self.non_keyboard_removed, other.non_keyboard_removed),
            (
                &mut self.non_keyboard_transliterated,
                other.non_keyboard_transliterated,
            ),
            (&mut self.emojis_dropped, other.emojis_dropped),
        ];
        for (total, extra) in counters {
            *total = total.saturating_add(extra);
        }

        // This counter only exists when the `security` feature is compiled in.
        #[cfg(feature = "security")]
        {
            self.bidi_controls_removed = self
                .bidi_controls_removed
                .saturating_add(other.bidi_controls_removed);
        }
    }

    /// Stand-in used when the `stats` feature is disabled: does nothing.
    ///
    /// # Arguments
    /// - `_other`: Ignored stats payload.
    #[cfg(not(feature = "stats"))]
    #[inline]
    pub fn accumulate(&mut self, _other: &CleaningStats) {}
}

// `record_stat!(stats, field, amount)` adds `amount` to `stats.field` with
// saturating addition. This variant is compiled when the `stats` feature is
// enabled.
#[cfg(feature = "stats")]
macro_rules! record_stat {
    ($stats:expr, $field:ident, $amount:expr) => {{
        $stats.$field = $stats.$field.saturating_add($amount);
    }};
}

// Feature-off twin of `record_stat!`: the counters are left untouched. The
// arguments are still referenced so call sites compile unchanged (presumably
// also to keep unused-variable warnings away).
#[cfg(not(feature = "stats"))]
macro_rules! record_stat {
    ($stats:expr, $field:ident, $amount:expr) => {{
        let _ = &$stats;
        let _ = stringify!($field);
        let _ = &$amount;
    }};
}

// `record_change!(changes, stats, field[, amount])` bumps the running change
// total and forwards the same amount to `record_stat!` for the named counter.
// The amount defaults to one and is converted to `u64` with `as`.
macro_rules! record_change {
    // Explicit amount: update the total first, then the per-category counter.
    ($changes:expr, $stats:expr, $field:ident, $amount:expr) => {{
        let delta = ($amount) as u64;
        $changes = $changes.saturating_add(delta);
        record_stat!($stats, $field, delta);
    }};
    // No amount given: record exactly one change.
    ($changes:expr, $stats:expr, $field:ident) => {{
        record_change!($changes, $stats, $field, 1u64);
    }};
}

/// Configuration for cleaning.
///
/// Build one through a preset constructor, `Default`, or
/// `CleaningOptions::builder`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CleaningOptions {
    /// Remove default-ignorable (hidden) code points.
    pub remove_hidden: bool,
    /// Trim trailing spaces/tabs per line.
    pub remove_trailing_whitespace: bool,
    /// Normalize Unicode spaces to the ASCII space.
    pub normalize_spaces: bool,
    /// Normalize Unicode dash variants to `-`.
    pub normalize_dashes: bool,
    /// Normalize curly/Unicode quotes to ASCII quotes.
    pub normalize_quotes: bool,
    /// Apply miscellaneous replacements: ellipsis characters become `...`
    /// and the fraction slash becomes `/`.
    pub normalize_other: bool,
    /// Restrict output to keyboard-safe characters.
    pub keyboard_only: bool,
    /// Allow a curated non-ASCII allowlist in keyboard mode.
    pub extended_keyboard: bool,
    /// Emoji handling; effective only if `keyboard_only` is `true`.
    pub emoji_policy: EmojiPolicy,
    /// Non-ASCII handling; effective only if `keyboard_only` is `true`.
    pub non_ascii_policy: NonAsciiPolicy,
    /// Keep ZWJ/ZWNJ when `remove_hidden` is enabled.
    pub preserve_joiners: bool,
    /// Remove control characters (Cc), excluding `\n`, `\r`, and `\t`.
    pub remove_control_chars: bool,
    /// Collapse consecutive whitespace runs to single spaces.
    pub collapse_whitespace: bool,
    /// Optional target line-ending style; `None` leaves line endings alone.
    pub normalize_line_endings: Option<LineEndingStyle>,
    /// Unicode normalization mode to apply before cleaning.
    pub unicode_normalization: UnicodeNormalizationMode,
    /// Remove bidirectional control characters. The field is hidden from the
    /// docs unless the `security` feature is enabled.
    #[cfg_attr(not(feature = "security"), doc(hidden))]
    pub strip_bidi_controls: bool,
}

/// Builder for [`CleaningOptions`].
///
/// Obtained from [`CleaningOptions::builder`]; each setter consumes the
/// builder and returns the updated one, and `build` yields the final options.
#[derive(Debug, Clone)]
pub struct CleaningOptionsBuilder {
    // Options accumulated so far; starts out as `CleaningOptions::default()`.
    options: CleaningOptions,
}

impl Default for CleaningOptions {
    /// Default configuration: keyboard-only output with every typographic
    /// normalization enabled, and no whitespace collapsing, line-ending
    /// rewriting, Unicode normalization or bidi stripping.
    fn default() -> Self {
        Self {
            // Hidden/control character removal.
            remove_hidden: true,
            preserve_joiners: false,
            remove_control_chars: true,
            strip_bidi_controls: false,

            // Typographic normalization.
            normalize_spaces: true,
            normalize_dashes: true,
            normalize_quotes: true,
            normalize_other: true,

            // Keyboard restriction.
            keyboard_only: true,
            extended_keyboard: false,
            emoji_policy: EmojiPolicy::Drop,
            non_ascii_policy: NonAsciiPolicy::Transliterate,

            // Whitespace, line endings and Unicode normalization.
            remove_trailing_whitespace: true,
            collapse_whitespace: false,
            normalize_line_endings: None,
            unicode_normalization: UnicodeNormalizationMode::None,
        }
    }
}

impl CleaningOptions {
    /// Create a [`CleaningOptionsBuilder`] seeded with the default options.
    ///
    /// # Returns
    /// A builder initialized from [`CleaningOptions::default`].
    pub fn builder() -> CleaningOptionsBuilder {
        let options = Self::default();
        CleaningOptionsBuilder { options }
    }

    /// Minimal preset: only removes hidden/invisible chars.
    ///
    /// # Returns
    /// A conservative preset in which `remove_hidden` is the only enabled
    /// transformation.
    pub fn minimal() -> Self {
        Self {
            // Hidden/control character removal.
            remove_hidden: true,
            preserve_joiners: false,
            remove_control_chars: false,
            strip_bidi_controls: false,

            // Typographic normalization.
            normalize_spaces: false,
            normalize_dashes: false,
            normalize_quotes: false,
            normalize_other: false,

            // Keyboard restriction.
            keyboard_only: false,
            extended_keyboard: false,
            emoji_policy: EmojiPolicy::Drop,
            non_ascii_policy: NonAsciiPolicy::Transliterate,

            // Whitespace, line endings and Unicode normalization.
            remove_trailing_whitespace: false,
            collapse_whitespace: false,
            normalize_line_endings: None,
            unicode_normalization: UnicodeNormalizationMode::None,
        }
    }

    /// Balanced preset for day-to-day text.
    ///
    /// # Returns
    /// A general-purpose preset: typographic normalization plus NFC, without
    /// the keyboard-only restriction.
    pub fn balanced() -> Self {
        Self {
            // Hidden/control character removal.
            remove_hidden: true,
            preserve_joiners: false,
            remove_control_chars: true,
            strip_bidi_controls: false,

            // Typographic normalization.
            normalize_spaces: true,
            normalize_dashes: true,
            normalize_quotes: true,
            normalize_other: true,

            // Keyboard restriction.
            keyboard_only: false,
            extended_keyboard: false,
            emoji_policy: EmojiPolicy::Drop,
            non_ascii_policy: NonAsciiPolicy::Transliterate,

            // Whitespace, line endings and Unicode normalization.
            remove_trailing_whitespace: true,
            collapse_whitespace: false,
            normalize_line_endings: None,
            unicode_normalization: UnicodeNormalizationMode::NFC,
        }
    }

    /// Humanize preset for AI/LLM-ish text.
    ///
    /// # Returns
    /// A preset tuned for typographic normalization and whitespace cleanup:
    /// NFKC plus whitespace collapsing, without the keyboard-only restriction.
    pub fn humanize() -> Self {
        Self {
            // Hidden/control character removal.
            remove_hidden: true,
            preserve_joiners: false,
            remove_control_chars: true,
            strip_bidi_controls: false,

            // Typographic normalization.
            normalize_spaces: true,
            normalize_dashes: true,
            normalize_quotes: true,
            normalize_other: true,

            // Keyboard restriction.
            keyboard_only: false,
            extended_keyboard: false,
            emoji_policy: EmojiPolicy::Drop,
            non_ascii_policy: NonAsciiPolicy::Transliterate,

            // Whitespace, line endings and Unicode normalization.
            remove_trailing_whitespace: true,
            collapse_whitespace: true,
            normalize_line_endings: None,
            unicode_normalization: UnicodeNormalizationMode::NFKC,
        }
    }

    /// Aggressive preset: maximum cleanup.
    ///
    /// # Returns
    /// A strict preset that targets keyboard-safe output, converts line
    /// endings to LF and strips bidi controls.
    pub fn aggressive() -> Self {
        Self {
            // Hidden/control character removal.
            remove_hidden: true,
            preserve_joiners: false,
            remove_control_chars: true,
            strip_bidi_controls: true,

            // Typographic normalization.
            normalize_spaces: true,
            normalize_dashes: true,
            normalize_quotes: true,
            normalize_other: true,

            // Keyboard restriction.
            keyboard_only: true,
            extended_keyboard: false,
            emoji_policy: EmojiPolicy::Drop,
            non_ascii_policy: NonAsciiPolicy::Transliterate,

            // Whitespace, line endings and Unicode normalization.
            remove_trailing_whitespace: true,
            collapse_whitespace: true,
            normalize_line_endings: Some(LineEndingStyle::Lf),
            unicode_normalization: UnicodeNormalizationMode::NFKC,
        }
    }

    /// Code-safe preset for docs/source-like content.
    ///
    /// # Returns
    /// A preset that preserves semantic punctuation and Unicode glyphs while
    /// still removing hidden/control noise.
    pub fn code_safe() -> Self {
        Self {
            // Hidden/control character removal (joiners are preserved).
            remove_hidden: true,
            preserve_joiners: true,
            remove_control_chars: true,
            strip_bidi_controls: false,

            // Typographic normalization: spaces only.
            normalize_spaces: true,
            normalize_dashes: false,
            normalize_quotes: false,
            normalize_other: false,

            // Keyboard restriction.
            keyboard_only: false,
            extended_keyboard: false,
            emoji_policy: EmojiPolicy::Keep,
            non_ascii_policy: NonAsciiPolicy::Transliterate,

            // Whitespace, line endings and Unicode normalization.
            remove_trailing_whitespace: true,
            collapse_whitespace: false,
            normalize_line_endings: None,
            unicode_normalization: UnicodeNormalizationMode::None,
        }
    }
}

// Every setter consumes the builder and hands back the updated value, so a
// call whose result is ignored silently discards the setting. `#[must_use]`
// makes the compiler warn about that mistake.
impl CleaningOptionsBuilder {
    /// Set `remove_hidden`.
    ///
    /// # Arguments
    /// - `value`: Whether to remove default-ignorable code points.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn remove_hidden(mut self, value: bool) -> Self {
        self.options.remove_hidden = value;
        self
    }

    /// Set `remove_trailing_whitespace`.
    ///
    /// # Arguments
    /// - `value`: Whether trailing spaces/tabs are trimmed per line.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn remove_trailing_whitespace(mut self, value: bool) -> Self {
        self.options.remove_trailing_whitespace = value;
        self
    }

    /// Set `normalize_spaces`.
    ///
    /// # Arguments
    /// - `value`: Whether Unicode spaces normalize to ASCII space.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn normalize_spaces(mut self, value: bool) -> Self {
        self.options.normalize_spaces = value;
        self
    }

    /// Set `normalize_dashes`.
    ///
    /// # Arguments
    /// - `value`: Whether Unicode dash variants normalize to `-`.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn normalize_dashes(mut self, value: bool) -> Self {
        self.options.normalize_dashes = value;
        self
    }

    /// Set `normalize_quotes`.
    ///
    /// # Arguments
    /// - `value`: Whether curly/Unicode quotes normalize to ASCII quotes.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn normalize_quotes(mut self, value: bool) -> Self {
        self.options.normalize_quotes = value;
        self
    }

    /// Set `normalize_other`.
    ///
    /// # Arguments
    /// - `value`: Whether miscellaneous replacements (for example ellipsis)
    ///   are applied.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn normalize_other(mut self, value: bool) -> Self {
        self.options.normalize_other = value;
        self
    }

    /// Set `keyboard_only`.
    ///
    /// # Arguments
    /// - `value`: Whether output is restricted to keyboard-safe characters.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn keyboard_only(mut self, value: bool) -> Self {
        self.options.keyboard_only = value;
        self
    }

    /// Set `extended_keyboard`.
    ///
    /// # Arguments
    /// - `value`: Whether curated non-ASCII keyboard characters are allowed.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn extended_keyboard(mut self, value: bool) -> Self {
        self.options.extended_keyboard = value;
        self
    }

    /// Set `emoji_policy`.
    ///
    /// # Arguments
    /// - `policy`: Emoji handling policy when `keyboard_only` is enabled.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn emoji_policy(mut self, policy: EmojiPolicy) -> Self {
        self.options.emoji_policy = policy;
        self
    }

    /// Set `non_ascii_policy`.
    ///
    /// # Arguments
    /// - `policy`: Non-ASCII handling strategy in keyboard-only mode.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn non_ascii_policy(mut self, policy: NonAsciiPolicy) -> Self {
        self.options.non_ascii_policy = policy;
        self
    }

    /// Set `preserve_joiners`.
    ///
    /// # Arguments
    /// - `value`: Whether ZWJ/ZWNJ are preserved when `remove_hidden` is enabled.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn preserve_joiners(mut self, value: bool) -> Self {
        self.options.preserve_joiners = value;
        self
    }

    /// Set `remove_control_chars`.
    ///
    /// # Arguments
    /// - `value`: Whether control characters are removed.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn remove_control_chars(mut self, value: bool) -> Self {
        self.options.remove_control_chars = value;
        self
    }

    /// Set `collapse_whitespace`.
    ///
    /// # Arguments
    /// - `value`: Whether consecutive whitespace runs collapse to single spaces.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn collapse_whitespace(mut self, value: bool) -> Self {
        self.options.collapse_whitespace = value;
        self
    }

    /// Set `normalize_line_endings`.
    ///
    /// # Arguments
    /// - `value`: Optional target line-ending style.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn normalize_line_endings(mut self, value: Option<LineEndingStyle>) -> Self {
        self.options.normalize_line_endings = value;
        self
    }

    /// Set `unicode_normalization`.
    ///
    /// # Arguments
    /// - `mode`: Unicode normalization mode to apply before cleaning.
    ///
    /// # Returns
    /// Updated builder.
    #[must_use]
    pub fn unicode_normalization(mut self, mode: UnicodeNormalizationMode) -> Self {
        self.options.unicode_normalization = mode;
        self
    }

    /// Set `strip_bidi_controls`.
    ///
    /// # Arguments
    /// - `value`: Whether bidirectional controls are removed.
    ///
    /// # Returns
    /// Updated builder.
    #[cfg_attr(not(feature = "security"), doc(hidden))]
    #[must_use]
    pub fn strip_bidi_controls(mut self, value: bool) -> Self {
        self.options.strip_bidi_controls = value;
        self
    }

    /// Build an immutable [`CleaningOptions`] value.
    ///
    /// # Returns
    /// Finalized options struct.
    #[must_use]
    pub fn build(self) -> CleaningOptions {
        self.options
    }
}

/// Main cleaner.
///
/// Wraps a [`CleaningOptions`] value; the cleaning methods take `&self`, so
/// one instance can be reused for any number of inputs.
pub struct TextCleaner {
    // Configuration consulted by every cleaning call.
    options: CleaningOptions,
}

impl TextCleaner {
    /// Create a cleaner from explicit options.
    ///
    /// The options are stored as given; nothing is validated here.
    ///
    /// # Arguments
    /// - `options`: Cleaning behavior configuration.
    ///
    /// # Returns
    /// A reusable [`TextCleaner`].
    pub fn new(options: CleaningOptions) -> Self {
        Self { options }
    }

    /// Borrow the cleaner options.
    ///
    /// # Returns
    /// Immutable reference to the [`CleaningOptions`] this cleaner was
    /// created with.
    pub fn options(&self) -> &CleaningOptions {
        &self.options
    }

    /// Clean `text`, panicking if the requested normalization is unavailable.
    ///
    /// This is a thin wrapper over [`TextCleaner::try_clean`] that converts
    /// its error into a panic.
    ///
    /// # Arguments
    /// - `text`: Input text to normalize.
    ///
    /// # Returns
    /// Cleaned output and stats.
    ///
    /// # Errors
    /// None are returned; use [`TextCleaner::try_clean`] when the failure
    /// should be handled instead.
    ///
    /// # Panics
    /// Panics when a normalization mode requires the `unorm` feature but it is
    /// not enabled.
    pub fn clean<'a>(&self, text: &'a str) -> CleaningResult<'a> {
        match self.try_clean(text) {
            Ok(result) => result,
            Err(err) => panic!(
                "clean() failed: {err}. Enable the 'unorm' feature or call try_clean() to handle the error"
            ),
        }
    }

    /// Clean `text` into a caller-provided buffer, panicking if the requested
    /// normalization is unavailable.
    ///
    /// This is a thin wrapper over [`TextCleaner::try_clean_into`] that
    /// converts its error into a panic.
    ///
    /// # Arguments
    /// - `text`: Input text to normalize.
    /// - `out`: Output buffer to reuse.
    ///
    /// # Returns
    /// A result borrowing from `out`.
    ///
    /// # Errors
    /// None are returned; use [`TextCleaner::try_clean_into`] when the
    /// failure should be handled instead.
    ///
    /// # Panics
    /// Panics when a normalization mode requires the `unorm` feature but it is
    /// not enabled.
    pub fn clean_into<'output>(
        &self,
        text: &str,
        out: &'output mut String,
    ) -> CleaningResult<'output> {
        match self.try_clean_into(text, out) {
            Ok(result) => result,
            Err(err) => panic!(
                "clean_into() failed: {err}. Enable the 'unorm' feature or call try_clean_into() to handle the error"
            ),
        }
    }

    /// Fallible variant of [`TextCleaner::clean`].
    ///
    /// Equivalent to [`TextCleaner::try_clean_with_context`] with no prior
    /// output.
    ///
    /// # Arguments
    /// - `text`: Input text to normalize.
    ///
    /// # Returns
    /// Cleaned output and stats.
    ///
    /// # Errors
    /// Returns [`CleaningError::NormalizationUnavailable`] when normalization
    /// was requested without the `unorm` feature.
    pub fn try_clean<'a>(&self, text: &'a str) -> Result<CleaningResult<'a>, CleaningError> {
        // A standalone call has no earlier chunks, so nothing was emitted yet.
        let has_prior_output = false;
        self.try_clean_with_context(text, has_prior_output)
    }

    /// Fallible variant of [`TextCleaner::clean_into`].
    ///
    /// Equivalent to [`TextCleaner::try_clean_into_with_context`] with no
    /// prior output.
    ///
    /// # Arguments
    /// - `text`: Input text to normalize.
    /// - `out`: Output buffer to reuse.
    ///
    /// # Returns
    /// A result borrowing from `out`.
    ///
    /// # Errors
    /// Returns [`CleaningError::NormalizationUnavailable`] when normalization
    /// was requested without the `unorm` feature.
    pub fn try_clean_into<'output>(
        &self,
        text: &str,
        out: &'output mut String,
    ) -> Result<CleaningResult<'output>, CleaningError> {
        // A standalone call has no earlier chunks, so nothing was emitted yet.
        let has_prior_output = false;
        self.try_clean_into_with_context(text, out, has_prior_output)
    }

    /// Clean one chunk of text, taking into account whether earlier chunks
    /// already produced output.
    ///
    /// Empty input, and input accepted by the ASCII fast path, is returned
    /// borrowed with zero changes. Everything else is normalized and cleaned
    /// into a freshly allocated string.
    ///
    /// # Arguments
    /// - `text`: Input chunk to clean.
    /// - `has_prior_output`: Whether earlier chunks already emitted output.
    ///
    /// # Returns
    /// Cleaned output and stats.
    ///
    /// # Errors
    /// Returns [`CleaningError::NormalizationUnavailable`] when normalization
    /// was requested without the `unorm` feature.
    pub fn try_clean_with_context<'a>(
        &self,
        text: &'a str,
        has_prior_output: bool,
    ) -> Result<CleaningResult<'a>, CleaningError> {
        // Nothing to rewrite: hand the input back without allocating. The
        // fast-path check is only evaluated for non-empty input.
        if text.is_empty() || self.can_use_ascii_fast_path(text) {
            return Ok(CleaningResult {
                text: Cow::Borrowed(text),
                changes_made: 0,
                stats: CleaningStats::default(),
            });
        }

        let normalized = self.normalize_input(text)?;
        let mut cleaned = String::with_capacity(normalized.len());
        let (changes_made, stats) =
            self.clean_into_internal(normalized, &mut cleaned, has_prior_output);

        Ok(CleaningResult {
            text: Cow::Owned(cleaned),
            changes_made,
            stats,
        })
    }

    /// Context-aware cleaner that writes into a caller-provided buffer.
    ///
    /// `out` is cleared first, so its previous contents never leak into the
    /// result. Input accepted by the ASCII fast path is copied verbatim.
    ///
    /// # Arguments
    /// - `text`: Input chunk to clean.
    /// - `out`: Output buffer to reuse.
    /// - `has_prior_output`: Whether earlier chunks already emitted output.
    ///
    /// # Returns
    /// A result borrowing from `out`.
    ///
    /// # Errors
    /// Returns [`CleaningError::NormalizationUnavailable`] when normalization
    /// was requested without the `unorm` feature.
    pub fn try_clean_into_with_context<'output>(
        &self,
        text: &str,
        out: &'output mut String,
        has_prior_output: bool,
    ) -> Result<CleaningResult<'output>, CleaningError> {
        out.clear();

        let (changes_made, stats) = if text.is_empty() {
            // Empty input: the cleared buffer is already the answer.
            (0, CleaningStats::default())
        } else if self.can_use_ascii_fast_path(text) {
            // Nothing to rewrite: copy the input as-is.
            out.push_str(text);
            (0, CleaningStats::default())
        } else {
            let normalized = self.normalize_input(text)?;
            self.clean_into_internal(normalized, out, has_prior_output)
        };

        Ok(CleaningResult {
            text: Cow::Borrowed(out.as_str()),
            changes_made,
            stats,
        })
    }

    fn clean_into_internal(
        &self,
        working_input: Cow<'_, str>,
        out: &mut String,
        has_prior_output: bool,
    ) -> (u64, CleaningStats) {
        let mut stats = CleaningStats::default();
        let mut changes = 0u64;

        let mut working = working_input;

        let mut line_ending_conversions = LineEndingCounts::default();
        if self.options.normalize_line_endings.is_some() {
            let (lf, counts) = to_lf(working.as_ref());
            line_ending_conversions = counts;
            working = Cow::Owned(lf);
        }

        out.clear();
        out.reserve(working.len());

        let mut pending_ws: usize = 0;
        let mut cap_next_whitespace = false;
        let mut drop_leading_whitespace = false;
        let mut emitted_anything = has_prior_output;
        let trim = self.options.remove_trailing_whitespace;
        let collapse = self.options.collapse_whitespace;

        let mut emoji_classifier: Option<EmojiClassifier> = None;
        let default_ignorables = CodePointSetData::new::<props::DefaultIgnorableCodePoint>();
        let mut cluster_buffer = String::new();
        #[cfg(feature = "security")]
        let bidi_controls: Option<CodePointSetDataBorrowed<'static>> =
            if self.options.strip_bidi_controls {
                Some(CodePointSetData::new::<props::BidiControl>())
            } else {
                None
            };

        for grapheme in UnicodeSegmentation::graphemes(working.as_ref(), true) {
            if grapheme.is_empty() {
                continue;
            }

            if is_newline_grapheme(grapheme) {
                if trim {
                    if pending_ws > 0 {
                        record_change!(changes, stats, trailing_whitespace_removed, pending_ws);
                        pending_ws = 0;
                        cap_next_whitespace = false;
                    }
                } else {
                    flush_pending_whitespace(out, pending_ws, collapse);
                    pending_ws = 0;
                    cap_next_whitespace = false;
                }
                out.push_str(grapheme);
                emitted_anything = true;
                drop_leading_whitespace = false;
                continue;
            }

            let mut emoji_cluster_cache: Option<bool> = None;
            let mut ensure_emoji_cluster = |classifier: &mut Option<EmojiClassifier>| -> bool {
                if let Some(value) = emoji_cluster_cache {
                    return value;
                }
                if grapheme.is_ascii() {
                    emoji_cluster_cache = Some(false);
                    return false;
                }
                if !self.options.keyboard_only && !self.options.remove_hidden {
                    emoji_cluster_cache = Some(false);
                    return false;
                }
                let classifier = classifier.get_or_insert_with(EmojiClassifier::new);
                let value = classify_emoji_cluster(grapheme, classifier).is_rendered;
                emoji_cluster_cache = Some(value);
                value
            };

            cluster_buffer.clear();
            cluster_buffer.reserve(grapheme.len());
            let mut emitted_directly = false;

            for mut c in grapheme.chars() {
                #[cfg(feature = "security")]
                if let Some(set) = bidi_controls {
                    if set.contains(c) {
                        record_change!(changes, stats, bidi_controls_removed);
                        continue;
                    }
                }

                if self.options.remove_hidden && default_ignorables.contains(c) {
                    let keep_hidden = (self.options.preserve_joiners && is_joiner(c))
                        || ((!self.options.keyboard_only
                            || matches!(self.options.emoji_policy, EmojiPolicy::Keep))
                            && ensure_emoji_cluster(&mut emoji_classifier));
                    if keep_hidden {
                        cluster_buffer.push(c);
                    } else {
                        record_change!(changes, stats, hidden_chars_removed);
                    }
                    continue;
                }

                if self.options.remove_control_chars && is_disallowed_control(c) {
                    record_change!(changes, stats, control_chars_removed);
                    continue;
                }

                if self.options.normalize_spaces {
                    if let Some(&mapped) = SPACE_MAP.get(&c) {
                        record_change!(changes, stats, spaces_normalized);
                        c = mapped;
                    }
                }

                if self.options.normalize_dashes {
                    if let Some(mapped) = map_dash(c) {
                        if mapped != c {
                            record_change!(changes, stats, dashes_normalized);
                        }
                        c = mapped;
                    }
                }

                if self.options.normalize_quotes {
                    if let Some(mapped) = map_quote(c) {
                        if mapped != c {
                            record_change!(changes, stats, quotes_normalized);
                        }
                        c = mapped;
                    }
                }

                if self.options.normalize_other {
                    match c {
                        FRACTION_SLASH => {
                            c = '/';
                            record_change!(changes, stats, other_normalized);
                        }
                        HORIZONTAL_ELLIPSIS | MIDLINE_HORIZONTAL_ELLIPSIS => {
                            if pending_ws > 0 {
                                if drop_leading_whitespace && !emitted_anything {
                                    pending_ws = 0;
                                } else {
                                    flush_pending_whitespace(out, pending_ws, collapse);
                                    pending_ws = 0;
                                }
                            }
                            out.push_str("...");
                            emitted_anything = true;
                            drop_leading_whitespace = false;
                            record_change!(changes, stats, other_normalized);
                            emitted_directly = true;
                            break;
                        }
                        _ => {}
                    }
                }

                cluster_buffer.push(c);
            }

            if emitted_directly {
                continue;
            }

            if cluster_buffer.is_empty() {
                continue;
            }

            if cluster_buffer.chars().all(|ch| matches!(ch, ' ' | '\t')) {
                let count = cluster_buffer.chars().count();
                if cap_next_whitespace {
                    pending_ws = 1;
                    cap_next_whitespace = false;
                } else {
                    pending_ws = pending_ws.saturating_add(count);
                }
                continue;
            }

            if self.options.keyboard_only {
                let is_emoji_cluster = ensure_emoji_cluster(&mut emoji_classifier);
                if is_emoji_cluster && matches!(self.options.emoji_policy, EmojiPolicy::Keep) {
                    if pending_ws > 0 {
                        if drop_leading_whitespace && !emitted_anything {
                            pending_ws = 0;
                        } else {
                            flush_pending_whitespace(out, pending_ws, collapse);
                            pending_ws = 0;
                        }
                    }
                    out.push_str(&cluster_buffer);
                    emitted_anything = true;
                    drop_leading_whitespace = false;
                    continue;
                }

                if let Some(rewrite) = rewrite_cluster_to_keyboard_ascii(
                    &cluster_buffer,
                    self.options.non_ascii_policy,
                    self.options.extended_keyboard,
                ) {
                    if rewrite.non_ascii_transliterated > 0 {
                        record_change!(
                            changes,
                            stats,
                            non_keyboard_transliterated,
                            rewrite.non_ascii_transliterated
                        );
                    }
                    if rewrite.non_ascii_removed > 0 {
                        record_change!(
                            changes,
                            stats,
                            non_keyboard_removed,
                            rewrite.non_ascii_removed
                        );
                    }
                    cluster_buffer = rewrite.text;

                    if pending_ws > 0 {
                        if drop_leading_whitespace && !emitted_anything {
                            pending_ws = 0;
                        } else {
                            flush_pending_whitespace(out, pending_ws, collapse);
                            pending_ws = 0;
                        }
                    }
                    out.push_str(&cluster_buffer);
                    emitted_anything = true;
                    drop_leading_whitespace = false;
                } else if is_emoji_cluster {
                    record_change!(changes, stats, emojis_dropped);
                    cluster_buffer.clear();
                    cap_next_whitespace = true;
                    drop_leading_whitespace = pending_ws == 0 && !emitted_anything;
                    if pending_ws > 0 {
                        pending_ws = 1;
                    }
                } else {
                    let removed = cluster_buffer
                        .chars()
                        .filter(|c| !is_keyboard_allowed(*c, self.options.extended_keyboard))
                        .count();
                    if removed > 0 {
                        record_change!(changes, stats, non_keyboard_removed, removed);
                    }
                    cluster_buffer.clear();
                    cap_next_whitespace = true;
                    drop_leading_whitespace = pending_ws == 0 && !emitted_anything;
                    if pending_ws > 0 {
                        pending_ws = 1;
                    }
                }
            } else {
                if pending_ws > 0 {
                    if drop_leading_whitespace && !emitted_anything {
                        pending_ws = 0;
                    } else {
                        flush_pending_whitespace(out, pending_ws, collapse);
                        pending_ws = 0;
                    }
                }
                out.push_str(&cluster_buffer);
                emitted_anything = true;
                drop_leading_whitespace = false;
            }
        }

        if trim {
            if pending_ws > 0 {
                record_change!(changes, stats, trailing_whitespace_removed, pending_ws);
            }
        } else if pending_ws > 0 {
            if drop_leading_whitespace && !emitted_anything {
                // drop leading whitespace that only existed due to removed clusters
            } else {
                flush_pending_whitespace(out, pending_ws, collapse);
            }
        }

        match self.options.normalize_line_endings {
            Some(LineEndingStyle::Lf) => {
                let total = line_ending_conversions.total();
                if total > 0 {
                    record_change!(changes, stats, line_endings_normalized, total);
                }
            }
            Some(style @ (LineEndingStyle::Crlf | LineEndingStyle::Cr)) => {
                let restamp_changes = restamp_line_endings_mut(style, out);
                let baseline = match style {
                    LineEndingStyle::Crlf => line_ending_conversions.crlf,
                    LineEndingStyle::Cr => line_ending_conversions.cr,
                    LineEndingStyle::Lf => unreachable!(),
                };
                let net = restamp_changes.saturating_sub(baseline);
                if net > 0 {
                    record_change!(changes, stats, line_endings_normalized, net);
                }
            }
            None => {}
        }

        (changes, stats)
    }

    /// Apply the configured Unicode normalization form to `text`.
    ///
    /// # Arguments
    /// - `text`: Input text to normalize.
    ///
    /// # Returns
    /// `Cow::Borrowed(text)` when the configured mode is
    /// [`UnicodeNormalizationMode::None`]; otherwise a freshly allocated
    /// string holding the normalized form.
    ///
    /// # Errors
    /// When the crate is compiled without the `unorm` feature, every mode
    /// other than `None` yields [`CleaningError::NormalizationUnavailable`]
    /// carrying the requested mode.
    fn normalize_input<'a>(&self, text: &'a str) -> Result<Cow<'a, str>, CleaningError> {
        match self.options.unicode_normalization {
            // No normalization requested: hand the input back without copying.
            UnicodeNormalizationMode::None => Ok(Cow::Borrowed(text)),
            // The four arms below only exist when the `unorm` feature is enabled.
            #[cfg(feature = "unorm")]
            UnicodeNormalizationMode::NFD => Ok(Cow::Owned(text.nfd().collect())),
            #[cfg(feature = "unorm")]
            UnicodeNormalizationMode::NFC => Ok(Cow::Owned(text.nfc().collect())),
            #[cfg(feature = "unorm")]
            UnicodeNormalizationMode::NFKD => Ok(Cow::Owned(text.nfkd().collect())),
            #[cfg(feature = "unorm")]
            UnicodeNormalizationMode::NFKC => Ok(Cow::Owned(text.nfkc().collect())),
            // Without `unorm`, this catch-all replaces the arms above and
            // reports the mode that could not be honoured.
            #[cfg(not(feature = "unorm"))]
            mode => Err(CleaningError::NormalizationUnavailable { requested: mode }),
        }
    }

    /// Report whether the ASCII fast path applies to `text`.
    ///
    /// # Arguments
    /// - `text`: Input text about to be cleaned.
    ///
    /// # Returns
    /// `true` only when `text` is pure ASCII and none of the following are
    /// enabled: trailing-whitespace removal, whitespace collapsing,
    /// line-ending normalization, control-character removal, or Unicode
    /// normalization.
    fn can_use_ascii_fast_path(&self, text: &str) -> bool {
        let opts = &self.options;
        // Evaluate the constant-time option flags first so the O(n) ASCII
        // scan is skipped entirely whenever an option already rules the
        // fast path out. All operands are side-effect free, so the result
        // is unchanged.
        let options_permit = !opts.remove_trailing_whitespace
            && !opts.collapse_whitespace
            && opts.normalize_line_endings.is_none()
            && !opts.remove_control_chars
            && matches!(opts.unicode_normalization, UnicodeNormalizationMode::None);
        options_permit && text.is_ascii()
    }
}

/// Summary of cumulative streaming cleanup work.
///
/// Produced by [`StreamCleaner::summary`] as a snapshot of the totals
/// gathered so far.
#[derive(Debug, Clone)]
pub struct StreamSummary {
    /// Aggregated counters over all emitted chunks.
    pub stats: CleaningStats,
    /// Total transformations across all emitted chunks.
    pub changes_made: u64,
}

/// Incremental cleaner that processes text in newline-delimited chunks.
///
/// Input is accumulated via `feed` and only cleaned up to the last newline
/// received; whatever follows stays buffered until more input arrives or
/// `finish` is called.
pub struct StreamCleaner {
    // Cleaner that performs the actual per-chunk work.
    cleaner: TextCleaner,
    // Text received so far that has not yet been processed.
    buffer: String,
    // Running sum of the statistics of every processed chunk.
    total_stats: CleaningStats,
    // Running (saturating) sum of `changes_made` over every processed chunk.
    total_changes: u64,
    // Becomes true once any processed chunk produced non-empty output; it is
    // forwarded to the cleaner as context for subsequent chunks.
    has_emitted_output: bool,
}

impl StreamCleaner {
    /// Build a streaming cleaner directly from cleaning options.
    ///
    /// # Arguments
    /// - `options`: Cleaning behavior configuration.
    ///
    /// # Returns
    /// A new [`StreamCleaner`] with an empty buffer and zeroed totals.
    pub fn new(options: CleaningOptions) -> Self {
        Self::from_cleaner(TextCleaner::new(options))
    }

    /// Wrap an already configured [`TextCleaner`] for streaming use.
    ///
    /// # Arguments
    /// - `cleaner`: Preconfigured text cleaner.
    ///
    /// # Returns
    /// A new [`StreamCleaner`] with an empty buffer and zeroed totals.
    pub fn from_cleaner(cleaner: TextCleaner) -> Self {
        Self {
            cleaner,
            buffer: String::new(),
            total_stats: CleaningStats::default(),
            total_changes: 0,
            has_emitted_output: false,
        }
    }

    /// Append `chunk` to the internal buffer and clean everything up to and
    /// including the last newline currently buffered.
    ///
    /// `out` is always cleared first, even when nothing is emitted.
    ///
    /// # Arguments
    /// - `chunk`: Incoming text data.
    /// - `out`: Reusable output buffer.
    ///
    /// # Returns
    /// `Some(CleaningResult)` when at least one complete line was processed,
    /// otherwise `None` (empty chunk, or no `'\n'` buffered yet).
    ///
    /// # Panics
    /// Panics when normalization is requested but the `unorm` feature is disabled.
    pub fn feed<'out>(
        &mut self,
        chunk: &str,
        out: &'out mut String,
    ) -> Option<CleaningResult<'out>> {
        out.clear();
        if chunk.is_empty() {
            return None;
        }
        self.buffer.push_str(chunk);

        // Everything through the final '\n' is ready; the tail stays buffered.
        let ready_len = self.buffer.rfind('\n').map(|newline_at| newline_at + 1)?;
        let ready: String = self.buffer.drain(..ready_len).collect();
        Some(self.process_owned_chunk(ready, out))
    }

    /// Clean and emit whatever text is still buffered at end-of-stream.
    ///
    /// `out` is always cleared first, even when nothing is emitted.
    ///
    /// # Arguments
    /// - `out`: Reusable output buffer.
    ///
    /// # Returns
    /// Final cleaned chunk when buffered content remains, otherwise `None`.
    ///
    /// # Panics
    /// Panics when normalization is requested but the `unorm` feature is disabled.
    pub fn finish<'out>(&mut self, out: &'out mut String) -> Option<CleaningResult<'out>> {
        out.clear();
        if self.buffer.is_empty() {
            None
        } else {
            // Taking the buffer leaves an empty string behind for reuse.
            let tail = std::mem::take(&mut self.buffer);
            Some(self.process_owned_chunk(tail, out))
        }
    }

    /// Snapshot the totals accumulated over every chunk processed so far.
    ///
    /// # Returns
    /// Aggregate counters and total change count.
    pub fn summary(&self) -> StreamSummary {
        let stats = self.total_stats.clone();
        StreamSummary {
            stats,
            changes_made: self.total_changes,
        }
    }

    /// Clean one chunk into `out` and fold its statistics into the running totals.
    ///
    /// Panics if the underlying cleaner reports an error.
    fn process_owned_chunk<'out>(
        &mut self,
        chunk: String,
        out: &'out mut String,
    ) -> CleaningResult<'out> {
        let cleaned = self
            .cleaner
            .try_clean_into_with_context(&chunk, out, self.has_emitted_output);
        let result = match cleaned {
            Ok(result) => result,
            Err(err) => panic!(
                "StreamCleaner::feed failed: {err}. Enable the 'unorm' feature or use try_clean"
            ),
        };

        self.total_stats.accumulate(&result.stats);
        self.total_changes = self.total_changes.saturating_add(result.changes_made);
        // Sticky flag: once any output has been produced it stays set.
        self.has_emitted_output |= !result.text.is_empty();

        result
    }
}

/// Result of classifying a grapheme cluster with `classify_emoji_cluster`.
#[derive(Clone, Copy)]
struct EmojiClusterContext {
    // True when the cluster contains a code point with the emoji-presentation
    // or extended-pictographic property, or an emoji code point combined with
    // VS16, ZWJ, or the enclosing keycap.
    is_rendered: bool,
}

/// Bundle of ICU code point sets consulted when classifying emoji clusters.
struct EmojiClassifier {
    /// Code points carrying the `Emoji` property.
    emoji: CodePointSetDataBorrowed<'static>,
    /// Code points carrying the `Emoji_Presentation` property.
    emoji_presentation: CodePointSetDataBorrowed<'static>,
    /// Code points carrying the `Extended_Pictographic` property.
    extended_pictographic: CodePointSetDataBorrowed<'static>,
}

impl EmojiClassifier {
    /// Load the three property sets used for emoji classification.
    fn new() -> Self {
        let emoji = CodePointSetData::new::<props::Emoji>();
        let emoji_presentation = CodePointSetData::new::<props::EmojiPresentation>();
        let extended_pictographic = CodePointSetData::new::<props::ExtendedPictographic>();
        Self {
            emoji,
            emoji_presentation,
            extended_pictographic,
        }
    }
}

/// Decide whether a grapheme cluster counts as a rendered emoji.
///
/// A cluster qualifies when any of its code points has the
/// emoji-presentation or extended-pictographic property, or when it contains
/// an emoji code point together with at least one of VS16 (U+FE0F),
/// ZWJ (U+200D), or the combining enclosing keycap (U+20E3).
fn classify_emoji_cluster(grapheme: &str, classifier: &EmojiClassifier) -> EmojiClusterContext {
    let mut saw_presentation = false;
    let mut saw_pictographic = false;
    let mut saw_emoji = false;
    // VS16, ZWJ and keycap are only ever tested together, so one flag suffices.
    let mut saw_emoji_modifier = false;

    for ch in grapheme.chars() {
        saw_presentation |= classifier.emoji_presentation.contains(ch);
        saw_pictographic |= classifier.extended_pictographic.contains(ch);
        saw_emoji |= classifier.emoji.contains(ch);
        saw_emoji_modifier |= matches!(ch, '\u{FE0F}' | '\u{200D}' | '\u{20E3}');
    }

    EmojiClusterContext {
        is_rendered: saw_presentation || saw_pictographic || (saw_emoji && saw_emoji_modifier),
    }
}

/// Emit deferred whitespace into `out` as ASCII spaces.
///
/// Writes nothing when `pending` is zero, exactly one space when `collapse`
/// is set, and `pending` spaces otherwise.
fn flush_pending_whitespace(out: &mut String, pending: usize, collapse: bool) {
    let spaces = if collapse { pending.min(1) } else { pending };
    out.extend(std::iter::repeat(' ').take(spaces));
}

/// Return `true` for C0/C1 control characters (including DEL) other than
/// line feed, carriage return, and horizontal tab.
fn is_disallowed_control(c: char) -> bool {
    // `char::is_control` covers exactly U+0000..=U+001F and U+007F..=U+009F.
    let is_permitted_whitespace = matches!(c, '\n' | '\r' | '\t');
    c.is_control() && !is_permitted_whitespace
}

/// Return `true` when the grapheme is exactly LF, CR, or CRLF.
fn is_newline_grapheme(g: &str) -> bool {
    const NEWLINES: [&str; 3] = ["\n", "\r", "\r\n"];
    NEWLINES.contains(&g)
}

/// Return `true` for ZWNJ (U+200C) or ZWJ (U+200D).
fn is_joiner(c: char) -> bool {
    c == '\u{200C}' || c == '\u{200D}'
}

/// Return `true` when `c` may appear in keyboard-only output: either it is
/// keyboard ASCII, or the extended keyboard is enabled and `c` is on the
/// extended allowlist.
fn is_keyboard_allowed(c: char, extended_keyboard: bool) -> bool {
    if is_keyboard_ascii(c) {
        return true;
    }
    extended_keyboard && is_extended_keyboard_char(c)
}

/// Per-kind tally of line terminators that were rewritten to LF.
#[derive(Debug, Default, Clone, Copy)]
struct LineEndingCounts {
    /// CRLF pairs.
    crlf: u64,
    /// Lone carriage returns.
    cr: u64,
    /// NEL (U+0085).
    nel: u64,
    /// LINE SEPARATOR (U+2028).
    ls: u64,
    /// PARAGRAPH SEPARATOR (U+2029).
    ps: u64,
}

impl LineEndingCounts {
    /// Sum of all per-kind counters.
    ///
    /// The sum saturates at `u64::MAX` instead of overflowing, matching the
    /// saturating arithmetic used when the individual counters are bumped.
    fn total(&self) -> u64 {
        [self.crlf, self.cr, self.nel, self.ls, self.ps]
            .iter()
            .fold(0u64, |sum, &count| sum.saturating_add(count))
    }
}

// ----------------- helpers -----------------

/// Rewrite every CRLF, lone CR, NEL (U+0085), LS (U+2028) and PS (U+2029) in
/// `s` to a single LF.
///
/// # Returns
/// The rewritten text together with a per-kind count of the conversions made.
/// A CRLF pair counts once and produces one LF.
fn to_lf(s: &str) -> (String, LineEndingCounts) {
    let mut normalized = String::with_capacity(s.len());
    let mut tally = LineEndingCounts::default();
    let mut chars = s.chars().peekable();

    while let Some(ch) = chars.next() {
        // Pick the counter for this terminator; ordinary characters are
        // copied through untouched.
        let counter = match ch {
            '\r' => {
                // Swallow the LF of a CRLF pair so it is not emitted twice.
                if chars.next_if_eq(&'\n').is_some() {
                    &mut tally.crlf
                } else {
                    &mut tally.cr
                }
            }
            '\u{0085}' => &mut tally.nel,
            '\u{2028}' => &mut tally.ls,
            '\u{2029}' => &mut tally.ps,
            other => {
                normalized.push(other);
                continue;
            }
        };
        *counter = counter.saturating_add(1);
        normalized.push('\n');
    }

    (normalized, tally)
}

/// Replace every LF in `text` with the terminator for `style`, in place.
///
/// # Returns
/// The number of LF bytes found in `text` before rewriting; always `0` for
/// [`LineEndingStyle::Lf`], which leaves `text` untouched.
fn restamp_line_endings_mut(style: LineEndingStyle, text: &mut String) -> u64 {
    let terminator = match style {
        LineEndingStyle::Lf => return 0,
        LineEndingStyle::Crlf => "\r\n",
        LineEndingStyle::Cr => "\r",
    };

    let lf_count = text.bytes().filter(|&byte| byte == b'\n').count() as u64;
    // Skip the reallocation when there is nothing to rewrite.
    if lf_count > 0 {
        *text = text.replace('\n', terminator);
    }
    lf_count
}

/// Look up the replacement for `c` in `DASH_MAP`, if it has an entry.
fn map_dash(c: char) -> Option<char> {
    let replacement = DASH_MAP.get(&c)?;
    Some(*replacement)
}

/// Look up the replacement for `c` in `QUOTE_MAP`, if it has an entry.
fn map_quote(c: char) -> Option<char> {
    let replacement = QUOTE_MAP.get(&c)?;
    Some(*replacement)
}

/// Outcome of rewriting one cluster with `rewrite_cluster_to_keyboard_ascii`.
#[derive(Debug)]
struct KeyboardAsciiRewrite {
    // Rewritten cluster text (never empty when produced by the rewrite function).
    text: String,
    // Number of input characters that were folded or transliterated.
    non_ascii_transliterated: u64,
    // Number of input characters that produced no output and were dropped.
    non_ascii_removed: u64,
}

/// Rewrite a cluster so that it only contains keyboard-allowed characters.
///
/// Allowed characters are copied through. Every other character is handled
/// according to `policy`: dropped, folded, or folded with a transliteration
/// fallback.
///
/// # Returns
/// `None` when nothing of the cluster survives; otherwise the rewritten text
/// plus counts of how many characters were mapped and how many were dropped.
fn rewrite_cluster_to_keyboard_ascii(
    cluster: &str,
    policy: NonAsciiPolicy,
    extended_keyboard: bool,
) -> Option<KeyboardAsciiRewrite> {
    // Resolve the policy once: which rewriting stages may be attempted.
    let (may_fold, may_transliterate) = match policy {
        NonAsciiPolicy::Drop => (false, false),
        NonAsciiPolicy::Fold => (true, false),
        NonAsciiPolicy::Transliterate => (true, true),
    };

    let mut text = String::with_capacity(cluster.len());
    let mut transliterated = 0u64;
    let mut removed = 0u64;

    for ch in cluster.chars() {
        if is_keyboard_allowed(ch, extended_keyboard) {
            text.push(ch);
            continue;
        }

        // Transliteration is only tried when folding produced nothing.
        let mapped = (may_fold && append_folded_non_ascii(ch, &mut text, extended_keyboard))
            || (may_transliterate
                && append_transliterated_non_ascii(ch, &mut text, extended_keyboard));

        let counter = if mapped {
            &mut transliterated
        } else {
            &mut removed
        };
        *counter = counter.saturating_add(1);
    }

    if text.is_empty() {
        return None;
    }
    Some(KeyboardAsciiRewrite {
        text,
        non_ascii_transliterated: transliterated,
        non_ascii_removed: removed,
    })
}

/// Append the keyboard-representable part of `c`'s NFKD decomposition to `out`.
///
/// Each decomposed character is kept when keyboard-allowed, replaced when it
/// has a compatibility ASCII mapping, and skipped otherwise.
///
/// # Returns
/// `true` when at least one character was appended.
#[cfg(feature = "unorm")]
fn append_folded_non_ascii(c: char, out: &mut String, extended_keyboard: bool) -> bool {
    let len_before = out.len();
    let source = c.to_string();
    out.extend(source.nfkd().filter_map(|piece| {
        if is_keyboard_allowed(piece, extended_keyboard) {
            Some(piece)
        } else {
            map_compatibility_ascii(piece)
        }
    }));
    // Every appended char occupies at least one byte, so growth means output.
    out.len() > len_before
}

/// Fallback used when the `unorm` feature is disabled: append the
/// compatibility ASCII mapping of `c`, if one exists.
///
/// # Returns
/// `true` when a character was appended.
#[cfg(not(feature = "unorm"))]
fn append_folded_non_ascii(c: char, out: &mut String, _: bool) -> bool {
    match map_compatibility_ascii(c) {
        Some(ascii) => {
            out.push(ascii);
            true
        }
        None => false,
    }
}

/// Append a transliteration of `c` to `out`.
///
/// A hand-written override takes precedence; otherwise `deunicode_char` is
/// consulted, but only for characters in the Latin candidate ranges.
///
/// # Returns
/// `true` when at least one character was appended.
fn append_transliterated_non_ascii(c: char, out: &mut String, extended_keyboard: bool) -> bool {
    let len_before = out.len();

    let replacement = transliteration_override(c).or_else(|| {
        if is_latin_transliteration_candidate(c) {
            deunicode_char(c)
        } else {
            None
        }
    });
    if let Some(mapping) = replacement {
        append_ascii_mapping(mapping, out, extended_keyboard);
    }

    out.len() > len_before
}

/// Return `true` when `c` lies in one of the code point ranges for which
/// transliteration via `deunicode` is attempted.
fn is_latin_transliteration_candidate(c: char) -> bool {
    // Inclusive (first, last) code point bounds.
    const CANDIDATE_RANGES: [(u32, u32); 7] = [
        (0x00C0, 0x024F),
        (0x1E00, 0x1EFF),
        (0x2C60, 0x2C7F),
        (0xA720, 0xA7FF),
        (0xAB30, 0xAB6F),
        (0x10780, 0x107BF),
        (0x1DF00, 0x1DFFF),
    ];

    let code_point = u32::from(c);
    CANDIDATE_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code_point))
}

/// Append the keyboard-representable characters of `mapped` to `out`.
///
/// Keyboard-allowed characters are copied, characters with a compatibility
/// ASCII mapping are replaced by it, and everything else is skipped.
fn append_ascii_mapping(mapped: &str, out: &mut String, extended_keyboard: bool) {
    let representable = mapped.chars().filter_map(|ch| {
        if is_keyboard_allowed(ch, extended_keyboard) {
            Some(ch)
        } else {
            map_compatibility_ascii(ch)
        }
    });
    out.extend(representable);
}

/// Hand-picked transliterations that take precedence over `deunicode`.
///
/// Currently covers only the German sharp s in both cases.
fn transliteration_override(c: char) -> Option<&'static str> {
    let ascii = match c {
        '\u{00DF}' => "ss", // LATIN SMALL LETTER SHARP S
        '\u{1E9E}' => "SS", // LATIN CAPITAL LETTER SHARP S
        _ => return None,
    };
    Some(ascii)
}

fn map_compatibility_ascii(c: char) -> Option<char> {
    match c {
        FRACTION_SLASH => Some('/'),
        _ => None,
    }
}

/// Clean `text` using [`CleaningOptions::default`].
///
/// # Arguments
/// - `text`: Input text to clean.
///
/// # Returns
/// Cleaned output and statistics.
///
/// The default preset emits keyboard-safe output. Non-ASCII text is
/// normalized/folded and transliterated when possible (for example
/// `"Café"` -> `"Cafe"`, `"Straße"` -> `"Strasse"`), while characters
/// with no feasible ASCII mapping are removed.
///
/// # Errors
/// This wrapper is infallible and never returns an error. For fallible
/// behavior, build a [`TextCleaner`] and call [`TextCleaner::try_clean`].
///
/// # Panics
/// Panics when normalization is requested but the `unorm` feature is disabled.
pub fn clean(text: &str) -> CleaningResult<'_> {
    let default_cleaner = TextCleaner::new(CleaningOptions::default());
    default_cleaner.clean(text)
}

/// Clean `text` using the [`CleaningOptions::humanize`] preset.
///
/// # Arguments
/// - `text`: Input text to clean.
///
/// # Returns
/// Cleaned output and statistics.
///
/// # Errors
/// This wrapper is infallible and never returns an error. For error
/// handling, build a [`TextCleaner`] and call [`TextCleaner::try_clean`].
///
/// # Panics
/// Panics when normalization is requested but the `unorm` feature is disabled.
pub fn humanize(text: &str) -> CleaningResult<'_> {
    let humanizing_cleaner = TextCleaner::new(CleaningOptions::humanize());
    humanizing_cleaner.clean(text)
}

// Unit tests for the cleaning pipeline: each test builds a `TextCleaner`
// (or uses the `clean` convenience wrapper) and checks the emitted text
// and, where relevant, the recorded statistics.
#[cfg(test)]
mod tests {
    use super::*;

    // Zero-width space (U+200B) is removed and counted as a hidden character.
    #[test]
    fn removes_hidden() {
        let c = TextCleaner::new(CleaningOptions {
            remove_hidden: true,
            ..CleaningOptions::minimal()
        });
        let out = c.clean("Hello\u{200B}World");
        assert_eq!(out.text, "HelloWorld");
        assert_eq!(out.stats.hidden_chars_removed, 1);
    }

    // Mongolian vowel separator (U+180E) is stripped by the default preset.
    #[test]
    fn removes_mongolian_vowel_separator() {
        let c = TextCleaner::new(CleaningOptions::default());
        let out = c.clean("Hello\u{180E}World");
        assert_eq!(out.text, "HelloWorld");
        assert!(out.stats.hidden_chars_removed >= 1);
    }

    // Curly quotes, NBSP, em dash and ellipsis all map to ASCII equivalents.
    #[test]
    fn normalizes_spaces_and_dashes_quotes_and_ellipsis() {
        let c = TextCleaner::new(CleaningOptions::default());
        let out = c.clean("\u{201C}Hi\u{201D}\u{00A0}\u{2014} ok…");
        assert_eq!(out.text, "\"Hi\" - ok...");
        assert!(out.stats.spaces_normalized >= 1);
        assert!(out.stats.dashes_normalized >= 1);
        assert!(out.stats.quotes_normalized >= 2);
        assert!(out.stats.other_normalized >= 1);
    }

    // Trailing spaces and tabs are removed per line; leading space is kept.
    #[test]
    fn trims_trailing_ws() {
        let c = TextCleaner::new(CleaningOptions {
            remove_trailing_whitespace: true,
            ..CleaningOptions::minimal()
        });
        let out = c.clean("a  \n b\t\t\n");
        assert_eq!(out.text, "a\n b\n");
        assert!(out.stats.trailing_whitespace_removed >= 3);
    }

    // Runs of spaces or tabs collapse to a single ASCII space.
    #[test]
    fn collapses_ws() {
        let c = TextCleaner::new(CleaningOptions {
            collapse_whitespace: true,
            ..CleaningOptions::minimal()
        });
        let out = c.clean("a    b\t\tc");
        assert_eq!(out.text, "a b c");
    }

    // With `EmojiPolicy::Keep`, emoji survive keyboard-only mode while CJK is removed.
    #[test]
    fn keyboard_only_with_emoji_policy() {
        let c = TextCleaner::new(CleaningOptions {
            keyboard_only: true,
            emoji_policy: EmojiPolicy::Keep,
            ..CleaningOptions::minimal()
        });
        let out = c.clean("Hello😀世界");
        assert_eq!(out.text, "Hello😀");
        assert!(out.stats.non_keyboard_removed >= 2);
    }

    // CRLF, lone CR and NEL are all converted to LF.
    #[test]
    fn normalize_eol_crlf_to_lf_and_back() {
        let c = TextCleaner::new(CleaningOptions {
            normalize_line_endings: Some(LineEndingStyle::Lf),
            ..CleaningOptions::minimal()
        });
        let out = c.clean("a\r\nb\rc\u{0085}");
        assert_eq!(out.text, "a\nb\nc\n");
        assert!(out.stats.line_endings_normalized >= 3);
    }

    // LINE SEPARATOR and PARAGRAPH SEPARATOR are converted to LF.
    #[test]
    fn normalizes_unicode_line_separators() {
        let mut options = CleaningOptions::minimal();
        options.normalize_line_endings = Some(LineEndingStyle::Lf);
        let c = TextCleaner::new(options);
        let out = c.clean("a\u{2028}b\u{2029}c");
        assert_eq!(out.text, "a\nb\nc");
        assert_eq!(out.stats.line_endings_normalized, 2);
    }

    // Restamping LF input as CRLF counts one change per line ending.
    #[test]
    fn restamping_counts_changes() {
        let mut options = CleaningOptions::minimal();
        options.normalize_line_endings = Some(LineEndingStyle::Crlf);
        let c = TextCleaner::new(options);
        let out = c.clean("a\nb\n");
        assert_eq!(out.text, "a\r\nb\r\n");
        assert_eq!(out.stats.line_endings_normalized, 2);
    }

    // End-to-end check of the default preset, including the total change count.
    #[test]
    fn default_cleaning_matches_keyboard_equivalent() {
        let out = clean("“Hello—world…”\u{00A0}😀");
        assert_eq!(out.text, "\"Hello-world...\"");
        assert_eq!(out.stats.quotes_normalized, 2);
        assert_eq!(out.stats.dashes_normalized, 1);
        assert_eq!(out.stats.other_normalized, 1);
        assert_eq!(out.stats.spaces_normalized, 1);
        assert_eq!(out.stats.emojis_dropped, 1);
        assert_eq!(out.changes_made, 7);
    }

    // Keyboard-only mode on top of the defaults drops both emoji and CJK.
    #[test]
    fn keyboard_only_drops_non_ascii_and_emoji() {
        let cleaner = TextCleaner::new(CleaningOptions {
            keyboard_only: true,
            ..CleaningOptions::default()
        });
        let out = cleaner.clean("Ascii😀世界");
        assert_eq!(out.text, "Ascii");
        assert_eq!(out.stats.emojis_dropped, 1);
        assert!(out.stats.non_keyboard_removed >= 2);
    }

    // Precomposed accented Latin letters fold to their base letters.
    #[test]
    fn keyboard_only_folds_latin_diacritics_to_ascii() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean("Caf\u{00E9} d\u{00E9}j\u{00E0} vu");
        assert_eq!(out.text, "Cafe deja vu");
        assert!(out.stats.non_keyboard_transliterated >= 3);
    }

    // Letters without a decomposition (ß, Æ, ø, œ) are transliterated instead.
    #[test]
    fn transliterates_non_decomposing_latin_letters() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean("Stra\u{00DF}e \u{00C6}sir \u{00F8}l \u{0153}uvre");
        assert_eq!(out.text, "Strasse AEsir ol oeuvre");
        assert!(out.stats.non_keyboard_transliterated >= 4);
    }

    // The same input under Drop, Fold and Transliterate policies.
    #[test]
    fn non_ascii_policy_modes_control_behavior() {
        let drop = TextCleaner::new(
            CleaningOptions::builder()
                .non_ascii_policy(NonAsciiPolicy::Drop)
                .build(),
        )
        .clean("Stra\u{00DF}e \u{00BD} \u{2122}");
        assert_eq!(drop.text, "Strae");

        let fold = TextCleaner::new(
            CleaningOptions::builder()
                .non_ascii_policy(NonAsciiPolicy::Fold)
                .build(),
        )
        .clean("Stra\u{00DF}e \u{00BD} \u{2122}");
        assert_eq!(fold.text, "Strae 1/2 TM");

        let transliterate = TextCleaner::new(
            CleaningOptions::builder()
                .non_ascii_policy(NonAsciiPolicy::Transliterate)
                .build(),
        )
        .clean("Stra\u{00DF}e \u{00BD} \u{2122}");
        assert_eq!(transliterate.text, "Strasse 1/2 TM");
        assert!(transliterate.stats.non_keyboard_transliterated >= 3);
    }

    // With the extended keyboard enabled, the euro sign is kept while the
    // trademark sign is still dropped under the Drop policy.
    #[test]
    fn extended_keyboard_allowlist_can_preserve_curated_symbols() {
        let default = TextCleaner::new(
            CleaningOptions::builder()
                .non_ascii_policy(NonAsciiPolicy::Drop)
                .build(),
        )
        .clean("€ and ™");
        assert_eq!(default.text, "and");

        let extended = TextCleaner::new(
            CleaningOptions::builder()
                .extended_keyboard(true)
                .non_ascii_policy(NonAsciiPolicy::Drop)
                .build(),
        )
        .clean("€ and ™");
        assert_eq!(extended.text, "€ and");
    }

    // ZWNJ inside Persian text is removed by default and kept with `preserve_joiners`.
    #[test]
    fn preserve_joiners_toggle_controls_zwj_zwnj_retention() {
        let text = "می\u{200C}خواهم";
        let default =
            TextCleaner::new(CleaningOptions::builder().keyboard_only(false).build()).clean(text);
        assert!(!default.text.contains('\u{200C}'));

        let preserved = TextCleaner::new(
            CleaningOptions::builder()
                .keyboard_only(false)
                .preserve_joiners(true)
                .build(),
        )
        .clean(text);
        assert!(preserved.text.contains('\u{200C}'));
    }

    // One input run through four option combinations, checking output and change counts.
    #[test]
    fn ts_whitespace_scenarios() {
        let input = "Hello\u{200B}\u{00A0}World!  ";

        // Defaults.
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean(input);
        assert_eq!(out.text, "Hello World!");
        assert_eq!(out.changes_made, 4);

        // Trailing whitespace retained.
        let cleaner = TextCleaner::new(CleaningOptions {
            remove_trailing_whitespace: false,
            ..CleaningOptions::default()
        });
        let out = cleaner.clean(input);
        assert_eq!(out.text, "Hello World!  ");
        assert_eq!(out.changes_made, 2);

        // Hidden characters retained.
        let cleaner = TextCleaner::new(CleaningOptions {
            remove_hidden: false,
            keyboard_only: false,
            ..CleaningOptions::default()
        });
        let out = cleaner.clean(input);
        assert_eq!(out.text, "Hello\u{200B} World!");
        assert_eq!(out.changes_made, 3);

        // Space normalization disabled: NBSP is kept.
        let cleaner = TextCleaner::new(CleaningOptions {
            normalize_spaces: false,
            keyboard_only: false,
            ..CleaningOptions::default()
        });
        let out = cleaner.clean(input);
        assert_eq!(out.text, "Hello\u{00A0}World!");
        assert_eq!(out.changes_made, 3);
    }

    // Only compiled with the `security` feature: bidi override/pop controls are stripped.
    #[cfg(feature = "security")]
    #[test]
    fn strips_bidi_controls_when_enabled() {
        let options = CleaningOptions {
            strip_bidi_controls: true,
            ..CleaningOptions::default()
        };
        let cleaner = TextCleaner::new(options);
        let out = cleaner.clean("\u{202E}ab\u{202C}c");
        assert_eq!(out.text, "abc");
        assert!(out.changes_made >= 2);
        #[cfg(feature = "stats")]
        {
            assert!(out.stats.bidi_controls_removed >= 2);
        }
    }

    // Em and en dashes become hyphens; the trailing emoji and its space are dropped.
    #[test]
    fn ts_dashes_case() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean("I — super — man – 💪");
        assert_eq!(out.text, "I - super - man -");
        assert_eq!(out.stats.dashes_normalized, 3);
        assert_eq!(out.stats.emojis_dropped, 1);
        assert_eq!(out.changes_made, 5);
    }

    // Curly, angular and low quotes all map to the ASCII double quote.
    #[test]
    fn ts_quotes_case() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean("Angular “quote” «marks» looks„ like Christmas «« tree");
        assert_eq!(
            out.text,
            "Angular \"quote\" \"marks\" looks\" like Christmas \"\" tree"
        );
        assert_eq!(out.stats.quotes_normalized, 7);
        assert_eq!(out.changes_made, 7);
    }

    // Single angle quotes and primes map to ', reversed double quotes and double primes to ".
    #[test]
    fn maps_additional_quotes_and_primes() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean("‹left› ‟double‟ ′prime′ ″double″");
        assert_eq!(out.text, "'left' \"double\" 'prime' \"double\"");
        assert!(out.stats.quotes_normalized >= 6);
    }

    // MINUS SIGN (U+2212) is treated as a dash.
    #[test]
    fn minus_sign_normalizes() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean("5 \u{2212} 3");
        assert_eq!(out.text, "5 - 3");
        assert!(out.stats.dashes_normalized >= 1);
    }

    // NARROW NO-BREAK SPACE (U+202F) becomes a plain space, counted once.
    #[test]
    fn narrow_nbsp_is_normalized() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean("5\u{202F}MB");
        assert_eq!(out.text, "5 MB");
        assert_eq!(out.stats.spaces_normalized, 1);
        assert_eq!(out.changes_made, 1);
    }

    // Every listed space-like code point maps to exactly one ASCII space.
    #[test]
    fn every_space_like_char_collapses_to_ascii_space() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let mut samples = vec!['\u{00A0}', '\u{1680}'];
        samples.extend((0x2000..=0x200A).filter_map(std::char::from_u32));
        samples.push('\u{202F}');
        samples.push('\u{205F}');
        samples.push('\u{3000}');

        for ch in samples {
            let input = format!("a{ch}b");
            let out = cleaner.clean(&input);
            assert_eq!(out.text, "a b", "failed for U+{:04X}", ch as u32);
            assert_eq!(
                out.stats.spaces_normalized, 1,
                "expected a single normalization for U+{:04X}",
                ch as u32
            );
        }
    }

    // FRACTION SLASH (U+2044) becomes '/', counted under `other_normalized`.
    #[test]
    fn fraction_slash_maps_to_ascii() {
        let cleaner = TextCleaner::new(CleaningOptions::default());
        let out = cleaner.clean("1\u{2044}2");
        assert_eq!(out.text, "1/2");
        assert_eq!(out.stats.other_normalized, 1);
    }

    // VS16 attached to an emoji is not treated as a removable hidden character.
    #[test]
    fn keeps_variation_selector_for_emoji() {
        let cleaner = TextCleaner::new(CleaningOptions {
            keyboard_only: false,
            ..CleaningOptions::default()
        });
        let out = cleaner.clean("👍\u{FE0F}");
        assert_eq!(out.text, "👍\u{FE0F}");
        assert_eq!(out.stats.hidden_chars_removed, 0);
    }

    // An emoji plus VS16 is dropped as a single emoji under `EmojiPolicy::Drop`.
    #[test]
    fn drops_emoji_sequence_when_policy_drop() {
        let cleaner = TextCleaner::new(CleaningOptions {
            keyboard_only: true,
            emoji_policy: EmojiPolicy::Drop,
            ..CleaningOptions::default()
        });
        let out = cleaner.clean("👍\u{FE0F}");
        assert_eq!(out.text, "");
        assert_eq!(out.stats.emojis_dropped, 1);
    }

    // Pins every field of the `code_safe` preset so accidental changes are caught.
    #[test]
    fn code_safe_preset_fields_match_cli_contract() {
        let options = CleaningOptions::code_safe();
        assert!(options.remove_hidden);
        assert!(options.remove_trailing_whitespace);
        assert!(options.normalize_spaces);
        assert!(!options.normalize_dashes);
        assert!(!options.normalize_quotes);
        assert!(!options.normalize_other);
        assert!(!options.keyboard_only);
        assert_eq!(options.emoji_policy, EmojiPolicy::Keep);
        assert_eq!(options.non_ascii_policy, NonAsciiPolicy::Transliterate);
        assert!(options.preserve_joiners);
        assert!(options.remove_control_chars);
        assert!(!options.collapse_whitespace);
        assert_eq!(options.normalize_line_endings, None);
        assert_eq!(
            options.unicode_normalization,
            UnicodeNormalizationMode::None
        );
    }
}