whisper-guard 0.3.0

//! Post-transcription segment cleaning for whisper output.
//!
//! Whisper's decoder can hallucinate in several patterns:
//! - **Consecutive repetition**: the same phrase repeated 5-50 times
//! - **Interleaved repetition**: A/B/A/B patterns with filler words between
//! - **Trailing noise**: `[music]`, `[BLANK_AUDIO]` tags after speech ends
//!
//! This module detects and removes all three patterns. The main entry point
//! is [`clean_transcript`], which chains all cleaning passes.

/// Extract the text portion after the timestamp bracket.
/// Lines look like `[0:00] some text` or plain text.
fn text_part(line: &str) -> &str {
    line.find("] ").map(|i| &line[i + 2..]).unwrap_or(line)
}

/// Whisper noise tokens that are NEVER legitimate transcript content.
/// These can be trimmed at any count and should be skipped by the dedup pass
/// so the dedicated noise handlers (`collapse_noise_markers`, `trim_trailing_noise`)
/// can deal with them.
fn is_always_noise(text: &str) -> bool {
    let t = text.trim().to_lowercase();
    t == "[music]" || t == "[blank_audio]" || t == "[silence]" || t == "music"
}

/// Whisper non-speech tokens that legitimately appear as `(crying)` /
/// `[laughter]` style annotations. Compared case-insensitively against each
/// whitespace-separated word inside the parentheses or brackets.
///
/// Keeping this an explicit allowlist (rather than "any short parenthetical")
/// is what stops the all-noise classifier from eating legitimate user
/// parentheticals like `(see attached)`, `(part 1)`, `(2 of 3)`, or
/// `(continued)` that whisper would never emit on its own.
///
/// Includes a handful of foreign-language whisper-emitted tokens (Polish,
/// Spanish, German, French) so non-English captures still get the bracketed
/// form recognized.
const NOISE_WORDS: &[&str] = &[
    // English non-speech events whisper labels on near-silent / noisy audio
    "crying",
    "laughter",
    "laughing",
    "applause",
    "growling",
    "music",
    "sobbing",
    "cheering",
    "sighing",
    "clapping",
    "coughing",
    "sneezing",
    "gasping",
    "whispering",
    "mumbling",
    "humming",
    "breathing",
    "silence",
    "snoring",
    "yelling",
    "screaming",
    // Whisper-specific synthetic tokens (typically bracketed)
    "blank_audio",
    "inaudible",
    "noise",
    "crosstalk",
    "typing",
    "static",
    "beep",
    "ringing",
    // Non-English whisper noise tokens we've seen in real captures
    "śmiech",    // Polish: laughter
    "risas",     // Spanish: laughter
    "musik",     // German: music
    "musique",   // French: music
    "musica",    // Italian/Spanish: music
    "música",    // Spanish/Portuguese: music
    "muzyka",    // Polish: music
    "applaus",   // German: applause
    "aplausos",  // Spanish: applause
    "applausi",  // Italian: applause
    "oklaski",   // Polish: applause
    "ruido",     // Spanish: noise
    "geräusch",  // German: noise
    "stille",    // German: silence
    "silencio",  // Spanish: silence
    "cisza",     // Polish: silence
    "rires",     // French: laughter
    "rire",      // French: laughter (singular form)
    "gelächter", // German: laughter
    "weeping",   // English variant of crying
];

/// High-confidence Whisper hallucination signatures.
///
/// Whisper, fed long stretches of near-silent audio, emits stable
/// "training-data leak" phrases from YouTube subtitles, Amara.org community
/// credits, and similar sources. These survive `dedup_interleaved` because
/// each occurrence is a different exact string (sometimes pluralized,
/// punctuated, or wrapped in different filler), and they survive
/// `collapse_noise_markers` because they are not bracketed/parenthetical
/// tokens. They look like normal sentences but are confidently identifiable
/// from a small list of recurring surface forms.
///
/// Phrases are normalized lowercase, with trailing punctuation stripped,
/// and matched exactly against the line's text content (after stripping any
/// `[h:mm:ss]` timestamp + optional speaker prefix). Substring matching
/// would risk false positives in real speech (e.g. "thank you for watching
/// the demo carefully" should NOT be dropped just because it contains
/// "thank you for watching").
///
/// Conservative bar: a phrase is added here only when it is virtually
/// impossible for a real human speaker to utter it as a complete sentence
/// in a meeting transcript. URL-style hallucinations are handled separately
/// in [`is_url_line`].
const KNOWN_HALLUCINATION_PHRASES: &[&str] = &[
    // YouTube subtitle openers/closers
    "thank you for watching",
    "thanks for watching",
    "thank you for watching!",
    "thank you so much for watching",
    "please subscribe to our channel",
    "please subscribe",
    "please like and subscribe",
    "like and subscribe",
    "smash that like button",
    "don't forget to subscribe",
    "see you in the next video",
    "see you next time",
    // Amara.org community subtitle hallucinations
    "subtitles by the amara.org community",
    "transcribed by the amara.org community",
    "translated by the amara.org community",
    "the amara.org community",
    "amara.org community",
    // Generic transcription-service hallucinations (bare forms; for prefix
    // matches with trailing content like `Transcripted by: www.amara.org`
    // see [`HALLUCINATION_LINE_PREFIXES`])
    "captions by the cyclope",
];

/// Hallucination phrases that whisper emits with trailing content, e.g.
/// `Transcripted by: www.transcription-exe-project.com` or
/// `Subtitles by the Amara.org community`. Matched as a **starts-with**
/// prefix against the normalized line text rather than exact-match, so
/// they catch both the bare form and the variants with trailing URLs,
/// service names, or fillers.
///
/// Conservative bar: only credit-attribution prefixes that a real meeting
/// participant is virtually guaranteed not to utter as the start of a
/// sentence. Niche edge cases (e.g. someone reading a meeting transcript
/// out loud that contains a `Transcribed by` credit line) accept this as
/// noise rather than complicate the detector.
const HALLUCINATION_LINE_PREFIXES: &[&str] = &[
    "transcripted by",
    "transcribed by",
    "captions by",
    "captioned by",
    "subtitles by",
    "translated by",
];

/// Check if a line is just a URL or domain reference, common in Whisper
/// long-tail hallucinations like `Transcripted by: www.transcription-...`.
///
/// Conservative match: requires the line content to start with `www.` or
/// `http://` / `https://`, with at most one trailing word of context. Drops
/// pure URL hallucinations without affecting real speech that happens to
/// mention a URL inline.
fn is_url_line(text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }
    // Require URL-ish prefix on the first whitespace-separated token.
    let first = trimmed.split_whitespace().next().unwrap_or("");
    first.starts_with("www.") || first.starts_with("http://") || first.starts_with("https://")
}

/// Test whether a transcript line's content matches a known Whisper
/// hallucination phrase (after timestamp/speaker prefix stripping and case
/// normalization). Used by [`strip_known_hallucinations`].
fn is_known_hallucination(text: &str) -> bool {
    let lowered = text.to_lowercase();
    let normalized = lowered
        .trim()
        .trim_end_matches(['.', '!', '?', ',', ';', ':'])
        .trim();
    if normalized.is_empty() {
        return false;
    }
    if KNOWN_HALLUCINATION_PHRASES.contains(&normalized) {
        return true;
    }
    if HALLUCINATION_LINE_PREFIXES
        .iter()
        .any(|p| normalized.starts_with(p))
    {
        return true;
    }
    is_url_line(normalized)
}

/// Case-insensitive membership check against [`NOISE_WORDS`].
///
/// The lookup table is small (under 50 entries) so a linear scan with
/// `eq_ignore_ascii_case` is fine; the non-ASCII Polish / Spanish forms are
/// matched verbatim after lowercasing the input. We lowercase here rather
/// than at insertion so the allowlist stays readable.
fn is_noise_word(word: &str) -> bool {
    let lower = word.to_lowercase();
    NOISE_WORDS.iter().any(|w| *w == lower)
}

/// Return true if the text (after timestamp) is a short non-speech marker.
///
/// Matches two whisper hallucination shapes:
/// - bracketed: `[music]`, `[Śmiech]`, `[BLANK_AUDIO]`, `[risas]`, `[Growling]`
/// - parenthetical: `(crying)`, `(coughing)`, `(applause)`, `(silence)`,
///   `(soft music)`, `(loud applause)`
///
/// Excludes timestamp-like content `[0:00]` and collapse markers from prior
/// dedup passes `[...] [repeated ...]`. Word-count (1-4 inner words) and
/// length (≤40 inner chars) constraints keep this conservative.
///
/// **Allowlist gate:** at least one whitespace-separated word inside the
/// delimiters must match [`NOISE_WORDS`] (case-insensitive). Without this,
/// short user-authored parentheticals like `(see attached)`, `(part 1)`,
/// `(2 of 3)`, or `(continued)` would be misclassified as whisper
/// hallucinations and dropped from the transcript.
///
/// This is the shared classifier used by both [`collapse_noise_markers`] and
/// the [`is_all_noise`] read-only signal.
pub fn is_noise_marker(text: &str) -> bool {
    let t = text.trim();
    if t.is_empty() {
        return false;
    }
    // Collapse markers from prior passes are not noise
    if t.starts_with("[...]") {
        return false;
    }
    // Optionally strip a single trailing '.' (whisper sometimes adds one)
    let t = t.strip_suffix('.').unwrap_or(t);

    // Accept either bracketed `[...]` or parenthetical `(...)` shape; both
    // collapse to the same inner substring after the outer delimiters.
    let matched =
        (t.starts_with('[') && t.ends_with(']')) || (t.starts_with('(') && t.ends_with(')'));
    if !matched {
        return false;
    }
    let inner = &t[1..t.len() - 1];

    // Reject timestamp-like patterns (digits and colons only)
    if inner.chars().all(|c| c.is_ascii_digit() || c == ':') {
        return false;
    }
    // Must be short (1-4 words, ≤40 chars) - non-speech markers are brief
    let word_count = inner.split_whitespace().count();
    if !(1..=4).contains(&word_count) || inner.len() > 40 {
        return false;
    }

    // Allowlist gate: the LAST whitespace-separated word inside the
    // delimiters must be a known whisper non-speech token. This dominance
    // rule keeps legitimate phrasings like `(music director)` or
    // `(applause sounds great)` from matching just because they happen to
    // contain a noise word, while still recognizing whisper's compound
    // emissions like `(soft music)` / `(loud applause)` / `(audience
    // laughter)` where the noise word terminates the phrase.
    inner
        .split_whitespace()
        .next_back()
        .is_some_and(is_noise_word)
}

/// Return true iff every non-empty line in `lines` is a noise marker (after
/// stripping any leading `[h:mm:ss]`-style timestamp).
///
/// Empty lines are ignored. An input with zero non-empty lines returns `false`
/// (there's nothing to call "all noise").
///
/// This is a read-only signal: it does not alter `lines`. Callers (notably
/// `minutes-core`) use it together with separate capture-health signals (e.g.
/// silent / sparse stems) to decide whether to suppress a transcript body that
/// is almost certainly fabricated.
pub fn is_all_noise(lines: &[String]) -> bool {
    let mut saw_any = false;
    for line in lines {
        let text = text_part(line).trim();
        if text.is_empty() {
            continue;
        }
        saw_any = true;
        if !is_noise_marker(text) {
            return false;
        }
    }
    saw_any
}

/// Statistics from transcript cleaning.
///
/// Each `after_*` field records the segment count *after* that pass ran. If a pass
/// is disabled in [`CleanOptions`], its field carries the count from the previous
/// (enabled) pass - making it safe to compute pass-level deltas like
/// `stats.original_lines - stats.after_consecutive_dedup` without checking which
/// passes ran.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct CleanStats {
    pub original_lines: usize,
    pub after_consecutive_dedup: usize,
    pub after_interleaved_dedup: usize,
    pub after_script_filter: usize,
    pub after_noise_markers: usize,
    pub after_trailing_trim: usize,
    pub after_command_strip: usize,
    /// Segment count after [`strip_known_hallucinations`] ran. If the pass
    /// is disabled in [`CleanOptions`], this carries the count from the
    /// previous (enabled) pass. Tracks the long-tail-of-silence Whisper
    /// hallucination removal (#242).
    pub after_hallucination_strip: usize,
    /// **Net** segment count delta (`original_lines - after_noise_markers`,
    /// where `after_noise_markers` is the final output count post-pipeline),
    /// not the raw count of input lines that were dropped.
    ///
    /// The dedup pass inserts a single `[...] [repeated audio removed]`
    /// annotation line in place of each collapsed run, so collapsing 5
    /// inputs to 1 occurrence + 1 annotation produces a net change of `-3`.
    /// To get the cleaner "input minus output" count, suppress the annotations
    /// via [`CleanOptions::keep_dedup_annotations`] = `false`.
    pub lines_removed: usize,
    /// `true` iff every non-empty line in the cleaned output is a noise
    /// marker (bracketed `[music]` / `[Growling]` or parenthetical
    /// `(crying)` / `(applause)`), and there is at least one such line.
    ///
    /// This is a **read-only signal** computed after the cleanup passes run.
    /// It does not change the cleanup output itself; downstream callers can
    /// use it (together with separate capture-health signals like sparse or
    /// silent stems) to decide whether to suppress a transcript body that is
    /// almost certainly fabricated on near-silent audio.
    ///
    /// Examples:
    /// - `["[0:07] (crying)", "[1:52] [Growling]"]` → `true`
    /// - `["[0:00] Hello world", "[0:03] [laughter]"]` → `false`
    /// - `[]` → `false` (nothing to call all-noise)
    pub all_noise: bool,
}

impl CleanStats {
    /// Compact one-line summary for logging.
    ///
    /// ```
    /// use whisper_guard::segments::{clean_segments, CleanStats};
    ///
    /// let (_, stats) = clean_segments(&[
    ///     "Thank you.".into(),
    ///     "Thank you.".into(),
    ///     "Thank you.".into(),
    ///     "Real content here.".into(),
    /// ]);
    /// assert!(stats.summary().starts_with("whisper-guard:"));
    /// ```
    pub fn summary(&self) -> String {
        // `after_noise_markers` is the final segment count post-pipeline
        // (collapse_noise_markers runs last; see clean_segments_with_options).
        format!(
            "whisper-guard: {} → {} segments ({} removed)",
            self.original_lines, self.after_noise_markers, self.lines_removed,
        )
    }
}

/// Toggles for each cleaning pass.
///
/// All passes default to enabled - `CleanOptions::default()` matches the production
/// configuration used by [Minutes](https://github.com/silverstein/minutes).
/// Use [`CleanOptions::none`] as a starting point if you want to enable only specific passes.
///
/// Passes always run in this order (fixed; the order matters for correctness):
///
/// 1. `strip_known_hallucinations` - drop lines matching known whisper
///    training-data leak phrases (YouTube/Amara/etc.) BEFORE dedup gets a
///    chance to turn them into `[...] [repeated audio removed]` annotations.
///    See [`strip_known_hallucinations`] for the conservative matching rule.
/// 2. `dedup_consecutive` - collapse runs of repeated real-content segments.
///    Always-noise tokens (`[music]`, `[blank_audio]`, `[silence]`, `music`) are
///    skipped here so the noise-aware passes downstream can handle them.
/// 3. `dedup_interleaved` - collapse A/B/A/B hallucination patterns
/// 4. `strip_foreign_script` - drop segments in unrelated writing systems
/// 5. `strip_trailing_commands` - strip `stop recording`-style voice commands.
///    Runs BEFORE trim so noise markers hidden behind a trailing command get
///    exposed to the trim pass.
/// 6. `trim_trailing_noise` - trim noise markers off the end. Always-noise
///    tokens get trimmed at any count; filler words (`yeah.`, `okay.`, `you`)
///    need a 5+ run to trigger.
/// 7. `collapse_noise_markers` - collapse middle-of-transcript `[music]`/
///    `[Śmiech]`/etc. runs. Runs LAST so trim has first crack at trailing
///    noise; whatever survives in the middle gets collapsed cleanly.
///
/// ```
/// use whisper_guard::segments::{clean_segments_with_options, CleanOptions};
///
/// // Only run the two dedup passes; leave foreign script and noise markers alone.
/// let opts = CleanOptions {
///     dedup_consecutive: true,
///     dedup_interleaved: true,
///     ..CleanOptions::none()
/// };
///
/// let (cleaned, stats) = clean_segments_with_options(
///     &["Hello.".into(), "Hello.".into(), "Hello.".into(), "World.".into()],
///     &opts,
/// );
/// // 3 "Hello." + 1 "World." → "Hello." + dedup-annotation + "World."
/// assert_eq!(cleaned.len(), 3);
/// assert!(cleaned.iter().any(|s| s.contains("World")));
/// assert!(stats.lines_removed >= 1);
/// ```
// NOTE: deliberately NOT `#[non_exhaustive]`. Functional record update
// (`..CleanOptions::default()`) is the primary ergonomic pattern for this struct,
// and `#[non_exhaustive]` blocks it from external crates. New fields will be
// added as minor-version bumps with a CHANGELOG entry.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CleanOptions {
    pub dedup_consecutive: bool,
    pub dedup_interleaved: bool,
    pub strip_foreign_script: bool,
    pub collapse_noise_markers: bool,
    pub trim_trailing_noise: bool,
    pub strip_trailing_commands: bool,
    /// Drop lines matching known whisper hallucination phrases
    /// (YouTube/Amara/transcription-service training-data leaks). See
    /// [`strip_known_hallucinations`] for the full rationale and conservative
    /// matching rule. Catches the long-tail-of-silence failure mode (#242).
    pub strip_known_hallucinations: bool,
    /// Keep the `[...] [repeated audio removed - N identical segments collapsed]`
    /// annotation lines that the consecutive-dedup pass inserts.
    ///
    /// `true` (default) preserves them as a human-readable trail of what was
    /// stripped. Set to `false` for a cleaner output stream - useful when the
    /// segments are about to be fed to an LLM, joined into a flat string, or
    /// otherwise consumed by code rather than read by a human.
    pub keep_dedup_annotations: bool,
}

impl Default for CleanOptions {
    /// All passes enabled and dedup annotations preserved.
    /// Matches the production tuning used by Minutes.
    fn default() -> Self {
        Self {
            dedup_consecutive: true,
            dedup_interleaved: true,
            strip_foreign_script: true,
            collapse_noise_markers: true,
            trim_trailing_noise: true,
            strip_trailing_commands: true,
            strip_known_hallucinations: true,
            keep_dedup_annotations: true,
        }
    }
}

impl CleanOptions {
    /// All passes enabled. Equivalent to `CleanOptions::default()`.
    pub fn all() -> Self {
        Self::default()
    }

    /// All passes disabled. Useful as a base when you want to enable specific
    /// passes from scratch via the `..CleanOptions::none()` shorthand.
    ///
    /// Note: `keep_dedup_annotations` stays `true` here - it controls how dedup
    /// emits its output, not whether dedup runs. If you opt back into
    /// `dedup_consecutive`, you get the same annotation-emitting behavior as
    /// the default config; suppress annotations explicitly with
    /// `keep_dedup_annotations: false` if you don't want them.
    pub fn none() -> Self {
        Self {
            dedup_consecutive: false,
            dedup_interleaved: false,
            strip_foreign_script: false,
            collapse_noise_markers: false,
            trim_trailing_noise: false,
            strip_trailing_commands: false,
            strip_known_hallucinations: false,
            keep_dedup_annotations: true,
        }
    }
}

/// Prefix used to identify dedup annotation lines so they can be filtered.
const DEDUP_ANNOTATION_PREFIX: &str = "[...] [repeated audio removed";

/// Clean a list of raw transcript segments.
///
/// **This is the entry point if you're calling whisper-rs directly** (or any other
/// transcription engine that hands you `Vec<String>` segments). It runs every
/// hallucination guard with default settings and returns the cleaned segments plus
/// statistics about what was removed.
///
/// Idempotent: running it twice produces the same output.
///
/// # When to use this vs. [`clean_transcript`]
///
/// - Use [`clean_segments`] if you have raw segment text (the common case for
///   `whisper_state.get_segment(i).to_str()` callers).
/// - Use [`clean_transcript`] if you have a single string with timestamped lines
///   like `[0:00] hello world`.
///
/// # Example: cleaning whisper-rs output
///
/// ```
/// use whisper_guard::segments::clean_segments;
///
/// // Whisper hallucination pattern: same phrase repeated on silence
/// let raw = vec![
///     "Thank you.".to_string(),
///     "Thank you.".to_string(),
///     "Thank you.".to_string(),
///     "Thank you.".to_string(),
///     "What's the budget for this quarter?".to_string(),
/// ];
///
/// let (cleaned, stats) = clean_segments(&raw);
///
/// // Consecutive dedup keeps the first occurrence + an annotation line
/// // showing what was removed, so 4 repeats collapse to 2 segments.
/// assert!(stats.lines_removed >= 2);
/// assert!(cleaned.iter().any(|s| s.contains("budget")));
/// ```
pub fn clean_segments(segments: &[String]) -> (Vec<String>, CleanStats) {
    clean_segments_with_options(segments, &CleanOptions::default())
}

/// Clean a list of raw transcript segments with caller-controlled passes.
///
/// Like [`clean_segments`], but lets you disable specific passes if they cause
/// false positives in your pipeline. Pass order is fixed - see [`CleanOptions`]
/// for the rationale.
///
/// # Example: opt out of foreign-script filtering for multilingual transcripts
///
/// ```
/// use whisper_guard::segments::{clean_segments_with_options, CleanOptions};
///
/// let opts = CleanOptions {
///     strip_foreign_script: false,  // we expect mixed scripts
///     ..CleanOptions::default()
/// };
///
/// let segments = vec![
///     "Hello world".to_string(),
///     "你好世界".to_string(),  // would normally be filtered as foreign script
/// ];
/// let (cleaned, _stats) = clean_segments_with_options(&segments, &opts);
/// assert_eq!(cleaned.len(), 2);
/// ```
pub fn clean_segments_with_options(
    segments: &[String],
    opts: &CleanOptions,
) -> (Vec<String>, CleanStats) {
    let original_count = segments.len();
    let mut lines: Vec<String> = segments.to_vec();

    // Run BEFORE dedup_consecutive so repeated hallucination phrases are
    // dropped on identity rather than collapsed into a `[...] [repeated
    // audio removed]` annotation (which would mark hallucination noise as
    // worth-noting summarized content). The dedup_interleaved pass below
    // still handles other repetition shapes; this pass only catches the
    // exact-match training-data leak surface forms.
    if opts.strip_known_hallucinations {
        lines = strip_known_hallucinations(&lines);
    }
    let after_hallucination = lines.len();

    if opts.dedup_consecutive {
        lines = dedup_segments(&lines);
        if !opts.keep_dedup_annotations {
            lines.retain(|s| !s.starts_with(DEDUP_ANNOTATION_PREFIX));
        }
    }
    let after_consecutive = lines.len();

    if opts.dedup_interleaved {
        lines = dedup_interleaved(&lines);
    }
    let after_interleaved = lines.len();

    if opts.strip_foreign_script {
        lines = strip_foreign_script(&lines);
    }
    let after_script = lines.len();

    // Pipeline ordering rationale (matters for correctness):
    //
    //   1. strip_known_hallucinations (already ran above) - drop known
    //      whisper training-data leaks BEFORE dedup gets a chance to
    //      collapse them into `[...] [repeated audio removed]` annotations.
    //   2. dedup_consecutive (already ran above) - skips always-noise tokens
    //      so they flow downstream as a run for the noise-aware passes.
    //   3. dedup_interleaved (already ran above).
    //   4. strip_foreign_script (already ran above).
    //   5. strip_trailing_commands ← here. Runs BEFORE trim so that any
    //      always-noise markers hidden behind a trailing voice command
    //      (e.g. "…content [music] [music] Stop recording.") are exposed.
    //   6. trim_trailing_noise ← here. Catches all-noise tails at any count
    //      (always-noise tokens) and 5+ filler runs (`yeah.`, `okay.`, `you`).
    //   7. collapse_noise_markers ← runs LAST, so middle-of-transcript noise
    //      runs that survived trim get collapsed cleanly. If this ran earlier
    //      it would convert trailing `[music]` runs into `[music] + annotation`
    //      and trim would be blocked from cleaning them up.
    //
    // Each `after_X` stat reports the count after pass X ran, regardless of
    // chronological position in the pipeline. So `after_command_strip <=
    // after_trailing_trim` and `after_noise_markers >= after_trailing_trim`
    // are no longer guaranteed - the field name maps to a pass, not an
    // ordinal position.
    if opts.strip_trailing_commands {
        lines = strip_trailing_commands(&lines);
    }
    let after_command = lines.len();

    if opts.trim_trailing_noise {
        lines = trim_trailing_noise(&lines);
    }
    let after_trim = lines.len();

    if opts.collapse_noise_markers {
        lines = collapse_noise_markers(&lines);
    }
    let after_noise = lines.len();

    let stats = CleanStats {
        original_lines: original_count,
        after_consecutive_dedup: after_consecutive,
        after_interleaved_dedup: after_interleaved,
        after_script_filter: after_script,
        after_noise_markers: after_noise,
        after_trailing_trim: after_trim,
        after_command_strip: after_command,
        after_hallucination_strip: after_hallucination,
        // Net change from input to final output. `collapse_noise_markers`
        // runs last (per the pipeline-order rationale above), so
        // `after_noise_markers` is the final segment count.
        lines_removed: original_count.saturating_sub(after_noise),
        // Read-only signal: every non-empty surviving line is a noise marker.
        // Cleanup output is unchanged - downstream callers decide what to do.
        all_noise: is_all_noise(&lines),
    };

    (lines, stats)
}

/// Clean an existing transcript by running all post-processing dedup layers.
///
/// **Use this if your transcript is already a single string with timestamped lines**
/// like `[0:00] some text`. For raw segments straight from whisper, prefer
/// [`clean_segments`] - it skips the unnecessary parse/format round-trip.
///
/// Idempotent: running it on already-cleaned text produces the same output.
///
/// # Example
///
/// ```
/// use whisper_guard::segments::clean_transcript;
///
/// let raw = "[0:00] Hello world\n[0:03] Hello world\n[0:06] Hello world\n[0:09] Real content";
/// let (cleaned, stats) = clean_transcript(raw);
/// assert!(stats.lines_removed > 0);
/// assert!(cleaned.contains("Real content"));
/// ```
pub fn clean_transcript(transcript: &str) -> (String, CleanStats) {
    let lines: Vec<String> = transcript.lines().map(|l| l.to_string()).collect();
    let (cleaned, stats) = clean_segments(&lines);
    (cleaned.join("\n"), stats)
}

/// Detect and remove repetition loops from whisper output.
///
/// Whisper's decoder can get stuck repeating the same text across consecutive segments,
/// especially on non-English audio. This function detects runs of 3+ consecutive segments
/// with >80% text overlap and collapses them to the first occurrence.
pub fn dedup_segments(lines: &[String]) -> Vec<String> {
    if lines.len() < 3 {
        return lines.to_vec();
    }

    // Simple text similarity: ratio of matching chars to total chars (normalized)
    fn similarity(a: &str, b: &str) -> f64 {
        if a.is_empty() || b.is_empty() {
            return 0.0;
        }
        let a_lower = a.to_lowercase();
        let b_lower = b.to_lowercase();
        if a_lower == b_lower {
            return 1.0;
        }
        // Use longest common substring ratio as a fast similarity measure
        let (short, long) = if a_lower.len() <= b_lower.len() {
            (&a_lower, &b_lower)
        } else {
            (&b_lower, &a_lower)
        };
        if long.contains(short.as_str()) {
            return short.len() as f64 / long.len() as f64;
        }
        // Count matching words as fallback
        let a_words: Vec<&str> = a_lower.split_whitespace().collect();
        let b_words: Vec<&str> = b_lower.split_whitespace().collect();
        let matching = a_words.iter().filter(|w| b_words.contains(w)).count();
        let total = a_words.len().max(b_words.len());
        if total == 0 {
            return 0.0;
        }
        matching as f64 / total as f64
    }

    let mut result = Vec::with_capacity(lines.len());
    let mut i = 0;

    while i < lines.len() {
        let base_text = text_part(&lines[i]);

        // Always-noise tokens are NOT collapsed by dedup - they're handed to
        // collapse_noise_markers / trim_trailing_noise which know how to treat
        // them as a class. Collapsing here would prematurely turn a noise run
        // into "marker + annotation", which then can't be trimmed even if it's
        // entirely trailing.
        if is_always_noise(base_text) {
            result.push(lines[i].clone());
            i += 1;
            continue;
        }

        let mut run_end = i + 1;

        while run_end < lines.len() {
            let candidate = text_part(&lines[run_end]);
            if similarity(base_text, candidate) >= 0.8 {
                run_end += 1;
            } else {
                break;
            }
        }

        let run_len = run_end - i;

        if run_len >= 3 {
            tracing::debug!(
                first_segment = i,
                repeated_count = run_len,
                text = base_text,
                "detected repetition loop in whisper output - collapsing {} segments",
                run_len
            );
            result.push(lines[i].clone());
            result.push(format!(
                "{} - {} identical segments collapsed]",
                DEDUP_ANNOTATION_PREFIX,
                run_len - 1
            ));
            i = run_end;
        } else {
            result.push(lines[i].clone());
            i += 1;
        }
    }

    result
}

/// Detect interleaved repetition patterns that escape consecutive dedup.
///
/// Whisper often hallucinates alternating patterns like:
///   "So I'm going to pick his brain" / "Okay." / "So I'm going to pick his brain" / "Okay."
/// or inserts short filler between repeated phrases. The consecutive dedup misses these
/// because no two adjacent lines are similar.
///
/// Strategy: use a sliding window to detect when a single phrase dominates a region.
/// If any phrase appears in >=50% of lines within a 10-line window, and the window
/// contains at least 5 such occurrences, collapse the entire dominated region.
pub fn dedup_interleaved(lines: &[String]) -> Vec<String> {
    if lines.len() < 6 {
        return lines.to_vec();
    }

    fn normalize(text: &str) -> String {
        text.to_lowercase()
            .chars()
            .filter(|c| c.is_alphanumeric() || c.is_whitespace())
            .collect::<String>()
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Short filler phrases that whisper inserts between hallucinated repetitions.
    fn is_filler(text: &str) -> bool {
        let normalized = text.trim().to_lowercase();
        let normalized = normalized.trim_matches(|c: char| !c.is_alphanumeric());
        matches!(
            normalized,
            "okay"
                | "ok"
                | "yeah"
                | "yes"
                | "right"
                | "so"
                | "and"
                | "but"
                | "well"
                | "uh"
                | "um"
                | "hmm"
                | "mhm"
        )
    }

    // Build normalized text for each line
    let texts: Vec<String> = lines.iter().map(|l| normalize(text_part(l))).collect();
    let fillers: Vec<bool> = texts.iter().map(|t| is_filler(t)).collect();

    // Mark lines that are part of a hallucination region.
    let mut remove = vec![false; lines.len()];

    let window_size = 10;
    let min_occurrences = 5;

    let mut i = 0;
    while i + window_size <= lines.len() {
        // Count phrase frequencies in this window (excluding fillers)
        let mut freq: std::collections::BTreeMap<&str, Vec<usize>> =
            std::collections::BTreeMap::new();
        for j in i..i + window_size {
            if !fillers[j] && !texts[j].is_empty() {
                freq.entry(&texts[j]).or_default().push(j);
            }
        }

        // Find the dominant phrase (BTreeMap for deterministic iteration order)
        let dominant = freq
            .iter()
            .max_by(|(phrase_a, pos_a), (phrase_b, pos_b)| {
                pos_a
                    .len()
                    .cmp(&pos_b.len())
                    .then_with(|| phrase_a.cmp(phrase_b))
            })
            .filter(|(_, positions)| positions.len() >= min_occurrences);

        if let Some((phrase, _)) = dominant {
            let phrase = phrase.to_string();
            // Extend the region: keep scanning forward while the phrase keeps appearing
            let mut region_end = i + window_size;
            while region_end < lines.len() {
                let t = &texts[region_end];
                if *t == phrase || fillers[region_end] {
                    region_end += 1;
                } else {
                    let mut gap = 0;
                    let mut found_resume = false;
                    for t in texts
                        .iter()
                        .take(lines.len().min(region_end + 3))
                        .skip(region_end)
                    {
                        if *t == phrase {
                            found_resume = true;
                            break;
                        }
                        gap += 1;
                    }
                    if found_resume && gap <= 2 {
                        region_end += gap + 1;
                    } else {
                        break;
                    }
                }
            }

            let region_len = region_end - i;
            let actual_count = (i..region_end).filter(|&j| texts[j] == phrase).count();

            if actual_count >= min_occurrences && region_len >= 6 {
                tracing::debug!(
                    region_start = i,
                    region_end = region_end,
                    occurrences = actual_count,
                    filler_count = (i..region_end).filter(|&j| fillers[j]).count(),
                    phrase = phrase,
                    "detected interleaved hallucination loop - marking {} lines for removal",
                    region_len
                );
                let mut kept_first = false;
                for j in i..region_end {
                    if !kept_first && texts[j] == phrase {
                        kept_first = true;
                    } else {
                        remove[j] = true;
                    }
                }
                i = region_end;
                continue;
            }
        }

        i += 1;
    }

    let removed_count = remove.iter().filter(|&&r| r).count();
    if removed_count > 0 {
        let mut result = Vec::with_capacity(lines.len() - removed_count + 1);
        let mut in_removed_run = false;

        for (idx, line) in lines.iter().enumerate() {
            if remove[idx] {
                if !in_removed_run {
                    in_removed_run = true;
                    let run_len = (idx..lines.len()).take_while(|&j| remove[j]).count();
                    result.push(format!(
                        "[...] [hallucinated repetition removed - {} lines collapsed]",
                        run_len
                    ));
                }
            } else {
                in_removed_run = false;
                result.push(line.clone());
            }
        }

        tracing::info!(
            original = lines.len(),
            removed = removed_count,
            remaining = result.len(),
            "interleaved dedup complete"
        );
        result
    } else {
        lines.to_vec()
    }
}

/// Collapse runs of bracketed non-speech markers in any language.
///
/// Whisper emits non-speech audio events as bracketed text: `[music]`, `[laughter]`,
/// `[applause]`, `[BLANK_AUDIO]`, etc. In non-English audio these appear in the
/// source language: `[Śmiech]` (Polish laughter), `[Musik]` (German music),
/// `[risas]` (Spanish laughter), etc.
///
/// The existing `trim_trailing_noise` only catches trailing English markers. This
/// function is language-agnostic - it detects any line whose text (after timestamp)
/// is a short bracketed expression `[word(s)]` and collapses consecutive runs of 3+.
/// It also collapses scattered patterns when >50% of a window are noise markers.
pub fn collapse_noise_markers(lines: &[String]) -> Vec<String> {
    if lines.len() < 3 {
        return lines.to_vec();
    }

    // Classifier lives at module scope (`is_noise_marker`) so the `all_noise`
    // signal in `CleanStats` can reuse it without duplicating the rules.

    let markers: Vec<bool> = lines
        .iter()
        .map(|l| is_noise_marker(text_part(l)))
        .collect();

    // Pass 1: Collapse consecutive runs of 3+ noise markers
    let mut result = Vec::with_capacity(lines.len());
    let mut i = 0;
    while i < lines.len() {
        if markers[i] {
            let run_start = i;
            while i < lines.len() && markers[i] {
                i += 1;
            }
            let run_len = i - run_start;
            if run_len >= 3 {
                result.push(lines[run_start].clone());
                result.push(format!(
                    "[...] [non-speech audio removed - {} markers collapsed]",
                    run_len - 1
                ));
                tracing::debug!(
                    run_start = run_start,
                    collapsed = run_len - 1,
                    sample = text_part(&lines[run_start]),
                    "collapsed consecutive noise markers"
                );
            } else {
                // Short run (1-2): keep as-is
                for line in lines.iter().take(i).skip(run_start) {
                    result.push(line.clone());
                }
            }
        } else {
            result.push(lines[i].clone());
            i += 1;
        }
    }

    // Pass 2: Ratio check - if ≥2/3 of remaining lines are noise markers, strip them all.
    // After pass 1 collapses consecutive runs, scattered markers that still dominate
    // the transcript are almost certainly hallucination. Real recordings rarely have
    // this density (e.g., a comedy show might have 30-40% [laughter] annotations, not 66%+).
    let remaining_markers = result
        .iter()
        .filter(|l| is_noise_marker(text_part(l)))
        .count();
    let content_lines = result.len().saturating_sub(remaining_markers);
    if remaining_markers > 0 && content_lines > 0 {
        let ratio = remaining_markers as f64 / result.len() as f64;
        if ratio >= 0.66 && remaining_markers >= 8 {
            tracing::info!(
                markers = remaining_markers,
                total = result.len(),
                ratio = format!("{:.0}%", ratio * 100.0),
                "high noise marker density - stripping scattered markers"
            );
            let mut stripped = Vec::with_capacity(content_lines + 1);
            let mut removed = 0usize;
            for line in &result {
                if is_noise_marker(text_part(line)) {
                    removed += 1;
                } else {
                    stripped.push(line.clone());
                }
            }
            stripped.push(format!(
                "[{} scattered non-speech markers removed]",
                removed
            ));
            return stripped;
        }
    }

    let removed = lines.len() - result.len();
    if removed > 0 {
        tracing::info!(
            original = lines.len(),
            removed = removed,
            "collapsed noise markers"
        );
    }

    result
}

/// Detect and remove lines with hallucinated foreign script.
///
/// When whisper processes silence or very low-signal audio, it often hallucinates
/// text in scripts unrelated to the actual audio - most commonly CJK characters
/// (Japanese/Chinese/Korean), Arabic, or Cyrillic in an otherwise Latin transcript.
///
/// This function determines the dominant script of the transcript and removes lines
/// that are primarily in a different script. It is conservative: it only acts when
/// there is a clear majority script (≥70% of lines) and only removes lines where
/// ≥50% of alphabetic characters are in a foreign script.
///
/// This is language-agnostic: a Japanese transcript with a few hallucinated Latin
/// lines would have the Latin lines removed, and vice versa. Also handles
/// Cyrillic, Arabic, and other scripts via the `Script::Other` bucket.
pub fn strip_foreign_script(lines: &[String]) -> Vec<String> {
    if lines.len() < 2 {
        return lines.to_vec();
    }

    // Classify each line's dominant script
    let classifications: Vec<Script> = lines
        .iter()
        .map(|l| classify_script(text_part(l)))
        .collect();

    // Count lines per script (ignoring Unknown/empty)
    let mut latin_count = 0usize;
    let mut cjk_count = 0usize;
    let mut other_count = 0usize;
    for s in &classifications {
        match s {
            Script::Latin => latin_count += 1,
            Script::Cjk => cjk_count += 1,
            Script::Other => other_count += 1,
            Script::Unknown => {}
        }
    }

    let meaningful = latin_count + cjk_count + other_count;
    if meaningful < 2 {
        return lines.to_vec();
    }

    // Determine majority script (must be ≥70% of meaningful lines)
    let majority = if latin_count as f64 / meaningful as f64 >= 0.7 {
        Script::Latin
    } else if cjk_count as f64 / meaningful as f64 >= 0.7 {
        Script::Cjk
    } else if other_count as f64 / meaningful as f64 >= 0.7 {
        Script::Other
    } else {
        return lines.to_vec(); // No clear majority - don't filter
    };

    let mut result = Vec::with_capacity(lines.len());
    let mut removed = 0usize;

    for (i, line) in lines.iter().enumerate() {
        let dominated_by_foreign = match (&classifications[i], &majority) {
            (Script::Unknown, _) => false, // Keep empty/punctuation-only lines
            (s, m) if s == m => false,     // Same script as majority
            _ => true,                     // Foreign script
        };

        if dominated_by_foreign {
            removed += 1;
        } else {
            result.push(line.clone());
        }
    }

    if removed > 0 {
        tracing::info!(
            removed = removed,
            majority = ?majority,
            "removed foreign-script hallucination lines"
        );
    }

    result
}

/// Drop transcript lines whose content matches a known Whisper hallucination
/// phrase ([`is_known_hallucination`]).
///
/// Catches the long-tail-of-silence failure mode (issue #242): on extended
/// near-silent audio, Whisper emits stable training-data leaks like
/// `Thank you for watching!`, `Subtitles by the Amara.org community`, and
/// `Transcripted by: www.transcription-...` URLs. These survive
/// `dedup_interleaved` (each line is a different exact string) and
/// `collapse_noise_markers` (they are not bracketed/parenthetical tokens).
///
/// Conservative: exact-match against the line's text content after
/// stripping the `[h:mm:ss]` timestamp and lowercasing. A line that
/// contains the phrase mid-sentence (e.g. `"thank you for watching the
/// demo carefully"`) is kept.
pub fn strip_known_hallucinations(lines: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(lines.len());
    let mut removed = 0usize;
    for line in lines {
        if is_known_hallucination(text_part(line)) {
            removed += 1;
        } else {
            result.push(line.clone());
        }
    }
    if removed > 0 {
        tracing::info!(
            removed = removed,
            "removed known whisper-hallucination phrases"
        );
    }
    result
}

#[derive(Debug, Clone, PartialEq, Eq)]
enum Script {
    Latin,
    Cjk,
    Other,
    Unknown,
}

/// Classify the dominant script of a text string.
/// Returns the script that comprises the majority of alphabetic characters.
fn classify_script(text: &str) -> Script {
    let mut latin = 0u32;
    let mut cjk = 0u32;
    let mut other_script = 0u32;

    for ch in text.chars() {
        if !ch.is_alphabetic() {
            continue;
        }
        if ch.is_ascii_alphabetic()
            || ('\u{00C0}'..='\u{024F}').contains(&ch) // Latin Extended
            || ('\u{1E00}'..='\u{1EFF}').contains(&ch)
        {
            latin += 1;
        } else if ('\u{4E00}'..='\u{9FFF}').contains(&ch)   // CJK Unified
            || ('\u{3400}'..='\u{4DBF}').contains(&ch)       // CJK Extension A
            || ('\u{3040}'..='\u{309F}').contains(&ch)       // Hiragana
            || ('\u{30A0}'..='\u{30FF}').contains(&ch)       // Katakana
            || ('\u{AC00}'..='\u{D7AF}').contains(&ch)
        // Hangul
        {
            cjk += 1;
        } else {
            other_script += 1;
        }
    }

    let total = latin + cjk + other_script;
    if total == 0 {
        return Script::Unknown;
    }

    if latin as f64 / total as f64 >= 0.5 {
        Script::Latin
    } else if cjk as f64 / total as f64 >= 0.5 {
        Script::Cjk
    } else {
        Script::Other
    }
}

/// Trim trailing non-speech noise from the end of a transcript.
///
/// Recordings often capture music, silence, or ambient noise after the conversation
/// ends. Long runs of `[music]`, `[BLANK_AUDIO]`, or very short filler at the end
/// add no value and make the transcript look broken.
pub fn trim_trailing_noise(lines: &[String]) -> Vec<String> {
    if lines.is_empty() {
        return Vec::new();
    }

    /// Filler tokens that COULD be legitimate one-word closings ("Thanks.",
    /// "Yeah.", "Okay."). These need a higher floor to avoid trimming real
    /// terse content; only trim when there's a 5+ run of them.
    fn is_filler(text: &str) -> bool {
        let t = text.trim().to_lowercase();
        t == "you" || t == "okay." || t == "yeah."
        // Note: collapse markers ("[...] [repeated ...]") are NOT noise -
        // treating them as noise would make clean_transcript non-idempotent.
    }

    // Trim always-noise at any count, but do not let that decision pull a
    // preceding one-word closing ("Yeah.", "Okay.") into the removed suffix.
    let mut noise_trim_from = lines.len();
    let mut always_noise_count = 0usize;
    for i in (0..lines.len()).rev() {
        let text = text_part(&lines[i]);
        if is_always_noise(text) {
            noise_trim_from = i;
            always_noise_count += 1;
        } else {
            break;
        }
    }

    let mut filler_trim_from = noise_trim_from;
    let mut filler_count = 0usize;
    for i in (0..noise_trim_from).rev() {
        let text = text_part(&lines[i]);
        if is_filler(text) {
            filler_trim_from = i;
            filler_count += 1;
        } else {
            break;
        }
    }

    let trim_from = if filler_count >= 5 {
        filler_trim_from
    } else if always_noise_count > 0 {
        noise_trim_from
    } else {
        lines.len()
    };
    let trimmed_count = lines.len() - trim_from;

    if trimmed_count > 0 {
        tracing::info!(
            trimmed = trimmed_count,
            always_noise = always_noise_count,
            filler = filler_count,
            "removed trailing noise from transcript"
        );
        let mut result: Vec<String> = lines[..trim_from].to_vec();
        result.push(format!(
            "[Recording ended - {} lines of trailing noise removed]",
            trimmed_count
        ));
        result
    } else {
        lines.to_vec()
    }
}

/// Strip trailing voice command phrases that get captured by the mic.
///
/// Users commonly say "stop recording" or "end recording" out loud to signal
/// they're done. The microphone captures these phrases and Whisper transcribes
/// them as part of the meeting. This function removes them from the last 1-2
/// lines of the transcript.
pub fn strip_trailing_commands(lines: &[String]) -> Vec<String> {
    const COMMANDS: &[&str] = &[
        "stop recording",
        "stop the recording",
        "end recording",
        "end the recording",
        "stop transcription",
        "end transcription",
        "stop transcribing",
        "hey minutes stop",
        "minutes stop",
        "okay stop",
        "ok stop",
    ];

    let mut result = lines.to_vec();
    // Check last 2 lines - the command might be split across whisper segments
    for _ in 0..2 {
        if let Some(last) = result.last() {
            let text = text_part(last).trim().to_lowercase();
            let text = text.trim_end_matches('.');
            if COMMANDS
                .iter()
                .any(|cmd| text == *cmd || text.ends_with(cmd))
            {
                tracing::debug!(
                    line = result.last().map(|l| l.as_str()).unwrap_or(""),
                    "stripping trailing voice command"
                );
                result.pop();
            } else {
                break;
            }
        }
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    // ── clean_transcript end-to-end ──

    #[test]
    fn clean_transcript_removes_repetition() {
        let input = "[0:00] Hello world\n[0:03] Hello world\n[0:06] Hello world\n[0:09] Hello world\n[0:12] Something different\n";
        let (cleaned, stats) = clean_transcript(input);
        assert!(stats.lines_removed > 0);
        assert!(cleaned.contains("Something different"));
        assert!(cleaned.contains("repeated audio removed"));
    }

    #[test]
    fn clean_transcript_preserves_normal_text() {
        let input = "[0:00] First line\n[0:05] Second line\n[0:10] Third line\n";
        let (cleaned, stats) = clean_transcript(input);
        assert_eq!(stats.lines_removed, 0);
        assert!(cleaned.contains("First line"));
        assert!(cleaned.contains("Third line"));
    }

    // ── dedup_segments ──

    #[test]
    fn dedup_no_repetition() {
        let lines = vec![
            "[0:00] Hello world".into(),
            "[0:03] How are you".into(),
            "[0:06] Fine thanks".into(),
        ];
        let result = dedup_segments(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn dedup_collapses_exact_repetition() {
        let lines = vec![
            "[0:00] Hello world".into(),
            "[0:03] Hello world".into(),
            "[0:06] Hello world".into(),
            "[0:09] Hello world".into(),
            "[0:12] Something different".into(),
        ];
        let result = dedup_segments(&lines);
        assert_eq!(result.len(), 3);
        assert!(result[0].contains("Hello world"));
        assert!(result[1].contains("repeated audio removed"));
        assert!(result[2].contains("Something different"));
    }

    #[test]
    fn dedup_collapses_near_identical() {
        let lines = vec![
            "[0:00] Ok bene le macedi diesel".into(),
            "[0:03] Ok, bene le macedi diesel".into(),
            "[0:06] Ok bene, le macedi diesel".into(),
            "[0:09] Good morning".into(),
        ];
        let result = dedup_segments(&lines);
        assert_eq!(result.len(), 3);
        assert!(result[1].contains("repeated audio removed"));
    }

    #[test]
    fn dedup_leaves_two_similar_alone() {
        let lines = vec![
            "[0:00] Hello world".into(),
            "[0:03] Hello world".into(),
            "[0:06] Something else".into(),
        ];
        let result = dedup_segments(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn dedup_handles_empty() {
        let result = dedup_segments(&[]);
        assert!(result.is_empty());
    }

    #[test]
    fn dedup_handles_single_line() {
        let lines = vec!["[0:00] Hello".into()];
        let result = dedup_segments(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn dedup_multiple_runs() {
        let lines = vec![
            "[0:00] First phrase".into(),
            "[0:03] First phrase".into(),
            "[0:06] First phrase".into(),
            "[0:09] Second phrase".into(),
            "[0:12] Second phrase".into(),
            "[0:15] Second phrase".into(),
            "[0:18] Second phrase".into(),
            "[0:21] Normal text".into(),
        ];
        let result = dedup_segments(&lines);
        assert_eq!(result.len(), 5);
        assert!(result[1].contains("2 identical"));
        assert!(result[3].contains("3 identical"));
    }

    // ── interleaved dedup ──

    #[test]
    fn interleaved_catches_alternating_pattern() {
        let mut lines: Vec<String> = Vec::new();
        for i in 0..20 {
            let ts = i * 2;
            if i % 2 == 0 {
                lines.push(format!(
                    "[{}:{:02}] So I'm going to pick his brain as well.",
                    ts / 60,
                    ts % 60
                ));
            } else {
                lines.push(format!("[{}:{:02}] Okay.", ts / 60, ts % 60));
            }
        }
        lines.push("[0:40] Something completely different".into());

        let result = dedup_interleaved(&lines);
        assert!(
            result.len() <= 4,
            "expected <=4 lines, got {}: {:?}",
            result.len(),
            result
        );
        assert!(result.iter().any(|l| l.contains("pick his brain")));
        assert!(result
            .iter()
            .any(|l| l.contains("hallucinated repetition removed")));
        assert!(result
            .last()
            .unwrap()
            .contains("Something completely different"));
    }

    #[test]
    fn interleaved_leaves_normal_conversation() {
        let lines = vec![
            "[0:00] Hello how are you".into(),
            "[0:05] I'm fine thanks".into(),
            "[0:10] Great to hear".into(),
            "[0:15] Let's talk about the project".into(),
            "[0:20] Sure what's the update".into(),
            "[0:25] We shipped the feature".into(),
        ];
        let result = dedup_interleaved(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn interleaved_ignores_short_repeats() {
        let lines = vec![
            "[0:00] Hello world".into(),
            "[0:02] Okay.".into(),
            "[0:04] Hello world".into(),
            "[0:06] Okay.".into(),
            "[0:08] Hello world".into(),
            "[0:10] Something else".into(),
        ];
        let result = dedup_interleaved(&lines);
        assert_eq!(result, lines);
    }

    // ── trailing noise ──

    #[test]
    fn trim_trailing_music() {
        let mut lines: Vec<String> = vec![
            "[0:00] Hello world".into(),
            "[0:05] Some real content".into(),
        ];
        for i in 0..20 {
            lines.push(format!("[{}:00] [music]", i + 1));
        }
        let result = trim_trailing_noise(&lines);
        assert_eq!(result.len(), 3);
        assert!(result[0].contains("Hello world"));
        assert!(result[1].contains("real content"));
        assert!(result[2].contains("trailing noise removed"));
    }

    #[test]
    fn trim_short_run_of_always_noise_now_trimmed() {
        // 0.2.0 behavior change: always-noise tokens (`[music]`, `[blank_audio]`,
        // `[silence]`, `music`) are NEVER legitimate transcript content, so they
        // get trimmed at any count. The 5-line floor still protects filler words
        // (`you`, `okay.`, `yeah.`) that COULD be legitimate one-word closings -
        // see `trim_keeps_short_trailing_filler` below.
        let lines: Vec<String> = vec![
            "[0:00] Hello world".into(),
            "[0:05] [music]".into(),
            "[0:10] [music]".into(),
            "[0:15] [music]".into(),
        ];
        let result = trim_trailing_noise(&lines);
        assert_eq!(result.len(), 2);
        assert!(result[0].contains("Hello world"));
        assert!(result[1].contains("trailing noise removed"));
    }

    #[test]
    fn trim_keeps_short_trailing_filler() {
        // Filler words at the end MUST survive - common legitimate closing.
        let lines: Vec<String> = vec!["[0:00] That wraps it".into(), "[0:05] yeah.".into()];
        let result = trim_trailing_noise(&lines);
        assert_eq!(result, lines, "single-filler closing must survive");
    }

    #[test]
    fn trim_keeps_short_filler_before_trailing_noise() {
        // A real one-word closing should not be swept away just because the
        // recorder captured an unambiguous noise marker after it.
        let lines: Vec<String> = vec![
            "[0:00] That wraps it".into(),
            "[0:05] yeah.".into(),
            "[0:10] [music]".into(),
        ];
        let result = trim_trailing_noise(&lines);
        assert_eq!(result.len(), 3);
        assert!(result[0].contains("That wraps it"));
        assert!(result[1].contains("yeah."));
        assert!(result[2].contains("1 lines of trailing noise removed"));
        assert!(!result.iter().any(|line| line.contains("[music]")));
    }

    #[test]
    fn trim_long_run_of_filler_is_trimmed() {
        // 5+ filler in a row is suspicious enough to trim.
        let lines: Vec<String> = vec![
            "[0:00] Real content".into(),
            "[0:05] yeah.".into(),
            "[0:10] yeah.".into(),
            "[0:15] yeah.".into(),
            "[0:20] yeah.".into(),
            "[0:25] yeah.".into(),
        ];
        let result = trim_trailing_noise(&lines);
        assert_eq!(result.len(), 2);
        assert!(result[0].contains("Real content"));
        assert!(result[1].contains("trailing noise removed"));
    }

    #[test]
    fn trim_handles_empty() {
        assert!(trim_trailing_noise(&[]).is_empty());
    }

    #[test]
    fn trim_all_noise() {
        let lines: Vec<String> = (0..10).map(|i| format!("[{}:00] [music]", i)).collect();
        let result = trim_trailing_noise(&lines);
        assert_eq!(result.len(), 1);
        assert!(result[0].contains("trailing noise removed"));
    }

    // ── foreign script detection ──

    #[test]
    fn script_removes_cjk_from_latin_transcript() {
        let lines = vec![
            "[0:00] Hello and welcome".into(),
            "[0:05] Let's discuss the project".into(),
            "[0:10] スパイシー".into(),
            "[0:15] We should wrap up now".into(),
        ];
        let result = strip_foreign_script(&lines);
        assert_eq!(result.len(), 3);
        assert!(!result.iter().any(|l| l.contains("スパイシー")));
    }

    #[test]
    fn script_preserves_pure_latin_transcript() {
        let lines = vec![
            "[0:00] Hello world".into(),
            "[0:05] How are you".into(),
            "[0:10] I'm doing fine".into(),
        ];
        let result = strip_foreign_script(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn script_preserves_pure_cjk_transcript() {
        let lines = vec![
            "[0:00] こんにちは".into(),
            "[0:05] お元気ですか".into(),
            "[0:10] 元気です".into(),
        ];
        let result = strip_foreign_script(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn script_no_action_on_mixed_transcript() {
        // No clear majority (50/50 split) - don't filter anything
        let lines = vec![
            "[0:00] Hello world".into(),
            "[0:05] こんにちは".into(),
            "[0:10] Good morning".into(),
            "[0:15] お元気ですか".into(),
        ];
        let result = strip_foreign_script(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn script_handles_single_line() {
        let lines = vec!["[0:00] スパイシー".into()];
        let result = strip_foreign_script(&lines);
        assert_eq!(result, lines); // Single line - no majority to compare against
    }

    #[test]
    fn script_all_hallucinated_in_latin_majority() {
        // Mostly Latin with a couple CJK hallucination lines (>70% Latin)
        let lines = vec![
            "[0:00] Today we need to discuss".into(),
            "[0:05] The quarterly results".into(),
            "[0:10] Are looking good".into(),
            "[0:15] Revenue is up".into(),
            "[0:20] Margins improved significantly".into(),
            "[0:25] 東京タワー".into(),
            "[0:30] 大阪城".into(),
        ];
        let result = strip_foreign_script(&lines);
        assert_eq!(result.len(), 5);
        assert!(result
            .iter()
            .all(|l| !l.contains('東') && !l.contains('大')));
    }

    #[test]
    fn script_two_cjk_lines_preserved() {
        // Exactly 2 CJK lines: majority is CJK, so both are kept (not hallucination).
        let lines = vec!["[0:00] スパイシー".into(), "[0:05] 東京タワー".into()];
        let result = strip_foreign_script(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn script_cyrillic_majority_strips_latin() {
        // Cyrillic majority with a Latin hallucination line.
        let lines = vec![
            "[0:00] Привет мир".into(),
            "[0:05] Как дела".into(),
            "[0:10] Всё хорошо".into(),
            "[0:15] Hello world".into(), // Hallucinated Latin
        ];
        let result = strip_foreign_script(&lines);
        assert_eq!(result.len(), 3);
        assert!(!result.iter().any(|l| l.contains("Hello")));
    }

    #[test]
    fn script_classify_basic() {
        assert_eq!(classify_script("Hello world"), Script::Latin);
        assert_eq!(classify_script("スパイシー"), Script::Cjk);
        assert_eq!(classify_script("Привет"), Script::Other);
        assert_eq!(classify_script(""), Script::Unknown);
        assert_eq!(classify_script("123 !@#"), Script::Unknown);
    }

    #[test]
    fn clean_transcript_includes_script_filter() {
        let input =
            "[0:00] Hello world\n[0:05] Testing one two\n[0:10] スパイシー\n[0:15] All done\n";
        let (cleaned, stats) = clean_transcript(input);
        assert!(!cleaned.contains("スパイシー"));
        assert!(stats.after_script_filter < stats.after_interleaved_dedup);
    }

    // ── noise marker collapse ──

    #[test]
    fn noise_markers_collapses_polish_laughter() {
        // Polish whisper hallucination: [Śmiech] = laughter
        let mut lines: Vec<String> = vec!["[0:00] Cześć, jak się masz?".into()];
        for i in 1..=10 {
            lines.push(format!("[0:{:02}] [Śmiech]", i * 3));
        }
        lines.push("[0:33] Dobrze, dziękuję".into());

        let result = collapse_noise_markers(&lines);
        assert!(
            result.len() <= 4,
            "got {} lines: {:?}",
            result.len(),
            result
        );
        assert!(result[0].contains("Cześć"));
        assert!(result
            .iter()
            .any(|l| l.contains("non-speech audio removed")));
        assert!(result.last().unwrap().contains("Dobrze"));
    }

    #[test]
    fn noise_markers_collapses_english_mixed() {
        let lines = vec![
            "[0:00] Good morning everyone".into(),
            "[0:05] [music]".into(),
            "[0:10] [laughter]".into(),
            "[0:15] [applause]".into(),
            "[0:20] [music]".into(),
            "[0:25] Thank you for coming".into(),
        ];
        let result = collapse_noise_markers(&lines);
        assert!(
            result.len() <= 4,
            "got {} lines: {:?}",
            result.len(),
            result
        );
        assert!(result[0].contains("Good morning"));
        assert!(result.last().unwrap().contains("Thank you"));
    }

    #[test]
    fn noise_markers_preserves_short_runs() {
        // 1-2 markers should be kept (legitimate non-speech annotations)
        let lines = vec![
            "[0:00] Hello world".into(),
            "[0:05] [laughter]".into(),
            "[0:10] That was funny".into(),
        ];
        let result = collapse_noise_markers(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn noise_markers_handles_empty() {
        assert!(collapse_noise_markers(&[]).is_empty());
    }

    #[test]
    fn noise_markers_handles_single_line() {
        let lines = vec!["[0:00] [music]".into()];
        let result = collapse_noise_markers(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn noise_markers_handles_two_lines() {
        let lines = vec!["[0:00] [music]".into(), "[0:03] [laughter]".into()];
        let result = collapse_noise_markers(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn is_noise_marker_matches_parenthetical_form() {
        // Whisper hallucinates parenthetical non-speech tokens on near-silent
        // audio just as readily as bracketed ones. Both shapes count.
        assert!(is_noise_marker("(crying)"));
        assert!(is_noise_marker("(coughing)"));
        assert!(is_noise_marker("(applause)"));
        assert!(is_noise_marker("(silence)"));
        // Trailing period (whisper sometimes adds one) is tolerated.
        assert!(is_noise_marker("(crying)."));
    }

    #[test]
    fn is_noise_marker_matches_bracketed_form() {
        assert!(is_noise_marker("[music]"));
        assert!(is_noise_marker("[Growling]"));
        assert!(is_noise_marker("[BLANK_AUDIO]"));
        assert!(is_noise_marker("[Śmiech]"));
        assert!(is_noise_marker("[laughter]."));
    }

    #[test]
    fn is_noise_marker_rejects_non_markers() {
        assert!(!is_noise_marker(""));
        assert!(!is_noise_marker("Hello world"));
        assert!(!is_noise_marker("[0:00]"));
        // Collapse marker from a prior pass is not noise.
        assert!(!is_noise_marker("[...] [repeated audio removed - 3]"));
        // Mismatched delimiters are not a marker.
        assert!(!is_noise_marker("(crying]"));
        assert!(!is_noise_marker("[crying)"));
        // Too long / too many words.
        assert!(!is_noise_marker(
            "(this is way more than four words of content)"
        ));
    }

    #[test]
    fn is_noise_marker_rejects_user_authored_parentheticals() {
        // Real users put short parentheticals in notes for legitimate reasons.
        // None of these contain a whisper non-speech token, so they must NOT
        // be classified as noise (codex blocker 1 on PR #246).
        assert!(!is_noise_marker("(see attached)"));
        assert!(!is_noise_marker("(part 1)"));
        assert!(!is_noise_marker("(2 of 3)"));
        assert!(!is_noise_marker("(continued)"));
        assert!(!is_noise_marker("(TBD)"));
        assert!(!is_noise_marker("(draft)"));
        // Trailing period is normalized but the content is still not noise.
        assert!(!is_noise_marker("(see attached)."));
    }

    #[test]
    fn is_noise_marker_accepts_two_word_noise_forms() {
        // Two-word parentheticals where one word is on the noise allowlist
        // (typical whisper emission shape).
        assert!(is_noise_marker("(soft music)"));
        assert!(is_noise_marker("(loud applause)"));
        assert!(is_noise_marker("(gentle music)"));
        assert!(is_noise_marker("(background music)"));
        // Same in brackets.
        assert!(is_noise_marker("[soft music]"));
        assert!(is_noise_marker("[loud applause]"));
    }

    #[test]
    fn is_noise_marker_rejects_user_authored_brackets() {
        // Brackets that don't end with a noise word should also pass through.
        // (Less common in user notes than parentheticals, but the allowlist
        // dominance rule applies uniformly to both shapes.)
        assert!(!is_noise_marker("[TODO]"));
        assert!(!is_noise_marker("[draft]"));
        assert!(!is_noise_marker("[part 1]"));
        assert!(!is_noise_marker("[see attached]"));
    }

    #[test]
    fn is_noise_marker_dominance_check_rejects_noise_word_with_content_suffix() {
        // The noise allowlist used to match if ANY word inside the marker
        // appeared in the noise list. That meant `(music director)` and
        // `(applause sounds great)` were classified as noise just because
        // they contained a noise word somewhere. Per codex review of PR
        // #246: the last word must be the noise token.
        assert!(!is_noise_marker("(music director)"));
        assert!(!is_noise_marker("(applause sounds great)"));
        assert!(!is_noise_marker("(noise complaint)"));
        assert!(!is_noise_marker("(typing speed)"));
        assert!(!is_noise_marker("[crying baby]"));
        assert!(!is_noise_marker("[laughter therapy]"));
    }

    #[test]
    fn is_noise_marker_dominance_check_accepts_modifier_plus_noise_word() {
        // The complement of the previous test: when the noise word
        // terminates the phrase (whisper's actual emission shape), the
        // marker is still classified as noise.
        assert!(is_noise_marker("(audience laughter)"));
        assert!(is_noise_marker("(soft music)"));
        assert!(is_noise_marker("(loud applause)"));
        assert!(is_noise_marker("(background music)"));
        assert!(is_noise_marker("[audience laughter]"));
    }

    #[test]
    fn is_noise_marker_accepts_expanded_allowlist_tokens() {
        // Tokens added per codex review of PR #246: real whisper outputs
        // that were missing from the v1 allowlist.
        assert!(is_noise_marker("[inaudible]"));
        assert!(is_noise_marker("[crosstalk]"));
        assert!(is_noise_marker("[typing]"));
        assert!(is_noise_marker("[noise]"));
        assert!(is_noise_marker("[static]"));
        assert!(is_noise_marker("[beep]"));
        assert!(is_noise_marker("[ringing]"));
        assert!(is_noise_marker("(inaudible)"));
        assert!(is_noise_marker("(crosstalk)"));
    }

    // ── Known-hallucination phrase detection (issue #242) ────────────

    #[test]
    fn is_known_hallucination_matches_youtube_phrases() {
        // High-confidence YouTube subtitle leak signatures.
        assert!(is_known_hallucination("Thank you for watching!"));
        assert!(is_known_hallucination("Thank you for watching."));
        assert!(is_known_hallucination("Thank you for watching"));
        assert!(is_known_hallucination("THANK YOU FOR WATCHING"));
        assert!(is_known_hallucination("Please subscribe to our channel."));
        assert!(is_known_hallucination("Please subscribe to our channel!"));
        assert!(is_known_hallucination("Like and subscribe"));
        assert!(is_known_hallucination("Don't forget to subscribe."));
    }

    #[test]
    fn is_known_hallucination_matches_amara_phrases() {
        // Amara.org community subtitle hallucinations.
        assert!(is_known_hallucination(
            "Subtitles by the Amara.org community"
        ));
        assert!(is_known_hallucination(
            "Transcribed by the Amara.org community"
        ));
        assert!(is_known_hallucination("the Amara.org community"));
        assert!(is_known_hallucination("Amara.org community"));
    }

    #[test]
    fn is_known_hallucination_matches_url_lines() {
        // Bare-URL lines (no leading label) are dropped via is_url_line.
        assert!(is_known_hallucination("www.transcription-exe-project.com"));
        assert!(is_known_hallucination("https://amara.org"));
        assert!(is_known_hallucination("http://example.com"));
    }

    #[test]
    fn is_known_hallucination_matches_attribution_prefixes() {
        // Whisper emits attribution-style hallucinations with varied
        // trailing content (URLs, service names, additional filler).
        // Codex review of PR #247 v1 flagged that `is_url_line` alone
        // could not catch `Transcripted by: www.amara.org` because the
        // first token is "Transcripted", not the URL. The prefix list
        // catches the full family.
        assert!(is_known_hallucination(
            "Transcripted by: www.transcription-exe-project.com"
        ));
        assert!(is_known_hallucination("Transcripted by: www.amara.org"));
        assert!(is_known_hallucination("Captioned by Acme Captions"));
        assert!(is_known_hallucination(
            "Transcribed by the Amara.org community"
        ));
        assert!(is_known_hallucination("Subtitles by the cyclope team"));
        assert!(is_known_hallucination("Translated by community volunteers"));
        assert!(is_known_hallucination(
            "Captions by the Acme transcription service"
        ));
        // Bare prefix without trailing content also matches.
        assert!(is_known_hallucination("Transcribed by"));
        assert!(is_known_hallucination("Subtitles by"));
    }

    #[test]
    fn is_known_hallucination_rejects_real_speech_containing_phrase() {
        // Real human speech that contains a hallucination phrase as a
        // substring must NOT be classified as hallucination. Substring
        // matching would have produced unacceptable false positives.
        assert!(!is_known_hallucination(
            "Thank you for watching the demo carefully"
        ));
        assert!(!is_known_hallucination(
            "I would like to subscribe to that newsletter"
        ));
        assert!(!is_known_hallucination(
            "The Amara.org community has done great work, but our use case differs"
        ));
        assert!(!is_known_hallucination(
            "Check out www.example.com for the docs we discussed"
        ));
    }

    #[test]
    fn is_known_hallucination_rejects_normal_content() {
        assert!(!is_known_hallucination("Hello world"));
        assert!(!is_known_hallucination("Let's review the action items"));
        assert!(!is_known_hallucination("Thanks for joining the call"));
        assert!(!is_known_hallucination(""));
    }

    #[test]
    fn strip_known_hallucinations_drops_matching_lines() {
        // Mixed real content + Whisper training-data leaks. The leak lines
        // are dropped; real content is preserved verbatim including order.
        let lines: Vec<String> = vec![
            "[0:00] Real meeting content".into(),
            "[35:00] Thank you for watching!".into(),
            "[35:30] More real content here".into(),
            "[36:00] Subtitles by the Amara.org community".into(),
            "[36:30] www.transcription-exe-project.com".into(),
            "[37:00] Closing remarks from the team".into(),
        ];
        let result = strip_known_hallucinations(&lines);
        assert_eq!(result.len(), 3);
        assert!(result[0].contains("Real meeting content"));
        assert!(result[1].contains("More real content here"));
        assert!(result[2].contains("Closing remarks"));
    }

    #[test]
    fn strip_known_hallucinations_preserves_normal_transcript() {
        let lines: Vec<String> = vec![
            "[0:00] Hello everyone".into(),
            "[0:05] Let's get started".into(),
            "[0:10] We have three things to cover today".into(),
        ];
        let result = strip_known_hallucinations(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn clean_segments_strips_long_tail_hallucinations() {
        // The #242 failure shape: real content followed by hallucinated
        // tail with YouTube/Amara surface forms. After cleanup the tail
        // is gone, real content survives, and the post-hallucination-strip
        // count reflects the drop.
        let segments: Vec<String> = vec![
            "Real meeting content one".into(),
            "Real meeting content two".into(),
            "Thank you for watching!".into(),
            "Please subscribe to our channel".into(),
            "Subtitles by the Amara.org community".into(),
            "www.transcription-exe-project.com".into(),
            "Thank you for watching".into(),
        ];
        let (cleaned, stats) = clean_segments(&segments);
        // All 5 known-hallucination signature lines stripped.
        assert_eq!(stats.after_hallucination_strip, 2);
        assert!(cleaned.iter().all(|s| s.contains("Real meeting content")));
    }

    #[test]
    fn is_url_line_only_matches_url_prefix() {
        assert!(is_url_line("www.example.com"));
        assert!(is_url_line("https://amara.org"));
        assert!(is_url_line("http://example.com"));
        assert!(is_url_line("www.example.com path"));
        // Trailing punctuation or extra content is OK on first-token URL.
        assert!(!is_url_line("Check out www.example.com"));
        assert!(!is_url_line(""));
        assert!(!is_url_line("Hello"));
    }

    #[test]
    fn is_all_noise_true_for_pure_noise_transcript() {
        // The exact 1-2 line failure case from issue #241.
        let lines = vec!["[0:07] (crying)".into(), "[1:52] [Growling]".into()];
        assert!(is_all_noise(&lines));
    }

    #[test]
    fn is_all_noise_true_for_single_noise_line() {
        let lines = vec!["[0:00] [music]".into()];
        assert!(is_all_noise(&lines));
    }

    #[test]
    fn is_all_noise_false_when_any_line_is_speech() {
        let lines = vec![
            "[0:00] Hello world".into(),
            "[0:05] [laughter]".into(),
            "[0:10] (crying)".into(),
        ];
        assert!(!is_all_noise(&lines));
    }

    #[test]
    fn is_all_noise_false_on_empty_input() {
        // No surviving lines → nothing to call "all noise".
        let lines: Vec<String> = Vec::new();
        assert!(!is_all_noise(&lines));
    }

    #[test]
    fn is_all_noise_ignores_blank_lines() {
        let lines = vec![
            "".into(),
            "[0:07] (crying)".into(),
            "   ".into(),
            "[1:52] [Growling]".into(),
        ];
        assert!(is_all_noise(&lines));
    }

    #[test]
    fn clean_stats_all_noise_true_for_short_noise_only_input() {
        // Two lines is below the collapse-pass threshold, so the noise lines
        // survive cleanup unchanged. The all_noise signal must still fire.
        let input = vec!["[0:07] (crying)".into(), "[1:52] [Growling]".into()];
        let (cleaned, stats) = clean_segments(&input);
        // Cleanup is read-only here: short runs are preserved.
        assert_eq!(cleaned, input);
        assert!(stats.all_noise, "stats: {:?}", stats);
    }

    #[test]
    fn clean_stats_all_noise_false_with_real_content() {
        let input = vec![
            "[0:00] Hello world".into(),
            "[0:05] (crying)".into(),
            "[0:10] Goodbye".into(),
        ];
        let (_, stats) = clean_segments(&input);
        assert!(!stats.all_noise, "stats: {:?}", stats);
    }

    #[test]
    fn noise_markers_ignores_timestamps() {
        // Timestamps like [0:00] are NOT noise markers
        let lines = vec![
            "[0:00] Hello".into(),
            "[0:05] World".into(),
            "[0:10] Test".into(),
        ];
        let result = collapse_noise_markers(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn noise_markers_ignores_collapse_markers() {
        // Prior dedup pass markers should not be treated as noise
        let lines = vec![
            "[0:00] Hello world".into(),
            "[...] [repeated audio removed - 5 identical segments collapsed]".into(),
            "[0:30] Something else".into(),
            "[...] [hallucinated repetition removed - 10 lines collapsed]".into(),
            "[1:00] Final line".into(),
        ];
        let result = collapse_noise_markers(&lines);
        assert_eq!(result, lines);
    }

    #[test]
    fn noise_markers_multilingual_markers() {
        // Various languages' non-speech markers
        let mut lines = Vec::new();
        lines.push("[0:00] Bonjour".into());
        // French: [rires] = laughter, [musique] = music
        for i in 1..=4 {
            lines.push(format!("[0:{:02}] [rires]", i * 3));
        }
        // German: [Musik], [Gelächter]
        for i in 5..=7 {
            lines.push(format!("[0:{:02}] [Musik]", i * 3));
        }
        lines.push("[0:30] Au revoir".into());

        let result = collapse_noise_markers(&lines);
        assert!(
            result.len() <= 5,
            "got {} lines: {:?}",
            result.len(),
            result
        );
        assert!(result[0].contains("Bonjour"));
        assert!(result.last().unwrap().contains("Au revoir"));
    }

    #[test]
    fn noise_markers_scattered_high_density() {
        // Pass 2 fires at ≥66% ratio with ≥8 remaining markers after pass 1.
        // Use pairs of markers (runs of 2, below pass 1's threshold of 3)
        // interleaved with single content lines: 5 content + 10 markers = 66.7%.
        let lines = vec![
            "[0:00] Real content one".into(),
            "[0:03] [Śmiech]".into(),
            "[0:06] [muzyka]".into(),
            "[0:09] Real content two".into(),
            "[0:12] [cisza]".into(),
            "[0:15] [oklaski]".into(),
            "[0:18] Real content three".into(),
            "[0:21] [Śmiech]".into(),
            "[0:24] [muzyka]".into(),
            "[0:27] Real content four".into(),
            "[0:30] [cisza]".into(),
            "[0:33] [oklaski]".into(),
            "[0:36] Real content five".into(),
            "[0:39] [Śmiech]".into(),
            "[0:42] [muzyka]".into(),
        ];
        let result = collapse_noise_markers(&lines);
        // All 5 content lines should survive
        let content_count = result.iter().filter(|l| l.contains("Real content")).count();
        assert_eq!(content_count, 5, "all content lines preserved");
        // Pass 2 should have stripped the scattered markers
        assert!(
            result
                .iter()
                .any(|l| l.contains("non-speech markers removed")),
            "expected pass 2 removal summary, got: {:?}",
            result
        );
    }

    #[test]
    fn noise_markers_below_threshold_kept() {
        // 50% markers (5 of 10) - below the 66% threshold, all kept
        let lines = vec![
            "[0:00] Real content one".into(),
            "[0:03] [laughter]".into(),
            "[0:06] Real content two".into(),
            "[0:09] [applause]".into(),
            "[0:12] Real content three".into(),
            "[0:15] [laughter]".into(),
            "[0:18] Real content four".into(),
            "[0:21] [music]".into(),
            "[0:24] Real content five".into(),
            "[0:27] [laughter]".into(),
        ];
        let result = collapse_noise_markers(&lines);
        // No markers stripped - density is too low for pass 2
        assert_eq!(result, lines);
    }

    #[test]
    fn noise_markers_handles_blank_audio() {
        let mut lines: Vec<String> = vec!["[0:00] Some content".into()];
        for i in 1..=6 {
            lines.push(format!("[0:{:02}] [BLANK_AUDIO]", i * 5));
        }
        lines.push("[0:35] More content".into());

        let result = collapse_noise_markers(&lines);
        assert!(result.len() <= 4);
        assert!(result
            .iter()
            .any(|l| l.contains("non-speech audio removed")));
    }

    #[test]
    fn clean_transcript_includes_noise_markers() {
        // Use varied markers so consecutive dedup doesn't catch them first.
        // This ensures the noise marker layer has work to do.
        let input = "[0:00] Hello world\n\
            [0:03] [Śmiech]\n\
            [0:06] [muzyka]\n\
            [0:09] [cisza]\n\
            [0:12] [oklaski]\n\
            [0:15] [Śmiech]\n\
            [0:18] [muzyka]\n\
            [0:21] [cisza]\n\
            [0:24] Goodbye\n";

        let (cleaned, stats) = clean_transcript(input);
        // Noise marker filter runs after script filter; should have removed some lines
        assert!(
            stats.after_noise_markers < stats.after_script_filter,
            "noise markers: {}, script filter: {}",
            stats.after_noise_markers,
            stats.after_script_filter
        );
        assert!(cleaned.contains("Hello world"));
        assert!(cleaned.contains("Goodbye"));
    }

    // ── strip_trailing_commands ──

    #[test]
    fn strip_command_removes_stop_recording() {
        let lines = vec![
            "[0:00] Great meeting everyone".into(),
            "[0:05] Let's wrap up".into(),
            "[0:10] Stop recording.".into(),
        ];
        let result = strip_trailing_commands(&lines);
        assert_eq!(result.len(), 2);
        assert!(result[1].contains("wrap up"));
    }

    #[test]
    fn strip_command_removes_with_timestamp() {
        let lines = vec!["[0:00] First point".into(), "[0:30] Stop recording".into()];
        let result = strip_trailing_commands(&lines);
        assert_eq!(result.len(), 1);
        assert!(result[0].contains("First point"));
    }

    #[test]
    fn strip_command_removes_end_recording() {
        let lines = vec![
            "[0:00] Discussion content".into(),
            "[0:10] End recording".into(),
        ];
        let result = strip_trailing_commands(&lines);
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn strip_command_removes_two_trailing_commands() {
        let lines = vec![
            "[0:00] Content".into(),
            "[0:10] Okay stop.".into(),
            "[0:12] Stop recording.".into(),
        ];
        let result = strip_trailing_commands(&lines);
        assert_eq!(result.len(), 1);
        assert!(result[0].contains("Content"));
    }

    #[test]
    fn strip_command_preserves_non_command_lines() {
        let lines = vec![
            "[0:00] We need to stop recording expenses".into(),
            "[0:05] The stop recording policy is important".into(),
        ];
        let result = strip_trailing_commands(&lines);
        assert_eq!(result.len(), 2, "non-command lines should be preserved");
    }

    #[test]
    fn strip_command_handles_empty() {
        let result = strip_trailing_commands(&[]);
        assert!(result.is_empty());
    }

    #[test]
    fn strip_command_case_insensitive() {
        let lines = vec![
            "[0:00] Meeting notes".into(),
            "[0:05] STOP RECORDING".into(),
        ];
        let result = strip_trailing_commands(&lines);
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn clean_transcript_strips_trailing_command() {
        let input =
            "[0:00] Important discussion\n[0:05] Action item for Bob\n[0:10] Stop recording.\n";
        let (cleaned, stats) = clean_transcript(input);
        assert!(!cleaned.contains("Stop recording"));
        assert!(cleaned.contains("Action item for Bob"));
        // Command-strip runs before trim now, so after_command_strip is the
        // count BEFORE the (no-op) trim runs. Trim is a no-op here, so the two
        // counts match.
        assert!(stats.after_command_strip <= stats.after_trailing_trim);
        assert_eq!(stats.lines_removed, 1);
    }

    // ---- clean_segments + CleanOptions ----

    #[test]
    fn clean_segments_handles_empty() {
        let (cleaned, stats) = clean_segments(&[]);
        assert!(cleaned.is_empty());
        assert_eq!(stats.original_lines, 0);
        assert_eq!(stats.lines_removed, 0);
    }

    #[test]
    fn clean_segments_passes_through_clean_input() {
        let input: Vec<String> = vec![
            "Welcome to the meeting.".into(),
            "Let's discuss Q3 numbers.".into(),
            "Revenue is up twelve percent.".into(),
        ];
        let (cleaned, stats) = clean_segments(&input);
        assert_eq!(cleaned, input, "clean input should be untouched");
        assert_eq!(stats.lines_removed, 0);
        assert_eq!(stats.after_command_strip, 3);
    }

    #[test]
    fn clean_segments_dedups_repeated_hallucination() {
        let input: Vec<String> = vec![
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "What's the budget for Q3?".into(),
        ];
        let (cleaned, stats) = clean_segments(&input);
        // Real content survives; the hallucination loop collapses to
        // first occurrence + annotation line.
        assert!(cleaned.iter().any(|s| s.contains("budget")));
        assert!(stats.lines_removed >= 2);
        // Annotation line is inserted to mark what was collapsed.
        assert!(cleaned.iter().any(|s| s.contains("repeated audio removed")));
    }

    #[test]
    fn clean_segments_is_idempotent() {
        let input: Vec<String> = vec![
            "Real content.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "More real content.".into(),
        ];
        let (first, _) = clean_segments(&input);
        let (second, second_stats) = clean_segments(&first);
        assert_eq!(first, second, "second pass should be a no-op");
        assert_eq!(second_stats.lines_removed, 0);
    }

    #[test]
    fn clean_segments_with_options_respects_disabled_passes() {
        let input: Vec<String> = vec![
            "Hello.".into(),
            "Hello.".into(),
            "Hello.".into(),
            "Hello.".into(),
        ];
        // Disable consecutive dedup; everything else still runs.
        let opts = CleanOptions {
            dedup_consecutive: false,
            ..CleanOptions::default()
        };
        let (cleaned, _) = clean_segments_with_options(&input, &opts);
        assert_eq!(cleaned.len(), input.len(), "dedup disabled → no removal");
    }

    #[test]
    fn clean_options_none_runs_no_passes() {
        let input: Vec<String> = vec![
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Stop recording.".into(),
        ];
        let (cleaned, stats) = clean_segments_with_options(&input, &CleanOptions::none());
        assert_eq!(cleaned, input, "no passes → no changes");
        assert_eq!(stats.lines_removed, 0);
    }

    #[test]
    fn clean_options_all_matches_default() {
        // Same default config exercised two ways must produce the same output.
        let input: Vec<String> = vec![
            "Real meeting content.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "[music]".into(),
        ];
        let (default_out, default_stats) = clean_segments(&input);
        let (all_out, all_stats) = clean_segments_with_options(&input, &CleanOptions::all());
        assert_eq!(default_out, all_out);
        assert_eq!(default_stats, all_stats);
    }

    #[test]
    fn clean_segments_works_on_raw_segments_without_timestamps() {
        // The fork-user case: raw segments straight from whisper_state.get_segment(i).
        // No timestamp brackets. Cleaning should still work end-to-end.
        let raw_segments: Vec<String> = vec![
            " Thank you.".into(), // whisper segments often have leading space
            " Thank you.".into(),
            " Thank you.".into(),
            " Thank you.".into(),
            " So what's our action plan?".into(),
        ];
        let (cleaned, stats) = clean_segments(&raw_segments);
        assert!(stats.lines_removed >= 2);
        assert!(cleaned.iter().any(|s| s.contains("action plan")));
    }

    #[test]
    fn clean_transcript_delegates_to_clean_segments() {
        // Both entry points should produce the same logical output
        // for an input where formatting doesn't matter.
        let raw = "Thank you.\nThank you.\nThank you.\nReal content.";
        let segments: Vec<String> = raw.lines().map(String::from).collect();
        let (transcript_out, _t_stats) = clean_transcript(raw);
        let (segments_out, _s_stats) = clean_segments(&segments);
        assert_eq!(transcript_out, segments_out.join("\n"));
    }

    #[test]
    fn clean_stats_summary_is_human_readable() {
        let input: Vec<String> = vec![
            "Hello.".into(),
            "Hello.".into(),
            "Hello.".into(),
            "World.".into(),
        ];
        let (_, stats) = clean_segments(&input);
        let summary = stats.summary();
        assert!(summary.contains("whisper-guard:"));
        assert!(summary.contains("4")); // original count
    }

    #[test]
    fn clean_segments_with_huge_input_does_not_panic() {
        // Defensive: 10k segments, all identical, should not blow up.
        let input: Vec<String> = (0..10_000).map(|_| "Thank you.".to_string()).collect();
        let (cleaned, stats) = clean_segments(&input);
        assert_eq!(stats.original_lines, 10_000);
        assert!(cleaned.len() < 10);
    }

    #[test]
    fn clean_segments_handles_unicode_correctly() {
        // Mixed scripts within a single legitimate segment shouldn't trigger filtering.
        let input: Vec<String> = vec![
            "Café meeting at 9am with Søren and José".into(),
            "Discussed naïve Bayes models".into(),
        ];
        let (cleaned, _) = clean_segments(&input);
        assert_eq!(cleaned.len(), 2, "unicode-in-Latin should not be filtered");
    }

    #[test]
    fn keep_dedup_annotations_default_true_preserves_marker() {
        let input: Vec<String> = vec![
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Real content.".into(),
        ];
        let (cleaned, _) = clean_segments(&input);
        assert!(
            cleaned
                .iter()
                .any(|s| s.starts_with(DEDUP_ANNOTATION_PREFIX)),
            "default behavior should preserve the annotation line"
        );
    }

    #[test]
    fn keep_dedup_annotations_false_strips_marker() {
        let input: Vec<String> = vec![
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "Real content.".into(),
        ];
        let opts = CleanOptions {
            keep_dedup_annotations: false,
            ..CleanOptions::default()
        };
        let (cleaned, stats) = clean_segments_with_options(&input, &opts);
        assert!(
            !cleaned
                .iter()
                .any(|s| s.starts_with(DEDUP_ANNOTATION_PREFIX)),
            "annotation should be removed"
        );
        // With annotation suppressed, output is just "Thank you." + "Real content."
        // Net removed: 5 - 2 = 3.
        assert_eq!(cleaned.len(), 2);
        assert_eq!(stats.lines_removed, 3);
    }

    #[test]
    fn keep_dedup_annotations_does_not_strip_other_bracket_content() {
        // A real segment that happens to start with a bracket should NOT be filtered.
        let input: Vec<String> = vec![
            "Thank you.".into(),
            "Thank you.".into(),
            "Thank you.".into(),
            "[NAME] said the deal closed.".into(),
        ];
        let opts = CleanOptions {
            keep_dedup_annotations: false,
            ..CleanOptions::default()
        };
        let (cleaned, _) = clean_segments_with_options(&input, &opts);
        assert!(
            cleaned.iter().any(|s| s.contains("[NAME]")),
            "non-annotation bracket content must survive"
        );
    }
}