// tokenix 0.45.0 - Docs.rs

use std::collections::HashMap;
use std::path::PathBuf;

use regex::Regex;
use rust_embed::Embed;
use serde::Deserialize;

/// One `match_output` rule of a filter, deserialized from the filter TOML.
///
/// Going by the `unless` documentation below, output matching `pattern` is
/// short-circuited to `message`. The code that applies the rule is not in this
/// part of the file.
#[derive(Debug, Deserialize, Clone)]
pub struct MatchOutput {
    /// Pattern the command output is tested against (presumably a regex, like
    /// `unless` — confirm against the code that applies the rule).
    pub pattern: String,
    /// Text emitted in place of the output when the rule short-circuits.
    pub message: String,
    /// Guard: when set, the short-circuit to `message` is skipped if the output
    /// also matches this regex. Prevents masking errors/warnings that appear
    /// alongside a success marker (e.g. "total size is" present, but so is "error").
    #[serde(default)]
    pub unless: Option<String>,
}

/// Declarative output filter for one family of commands, deserialized from an
/// entry of the `filters` table of a filter TOML file.
///
/// In the visible part of this file only `match_command` and `summarize_json`
/// (as a "can handle JSON" marker) are read, both by `find_filter`. Notes on the
/// other fields are inferred from their names and existing comments and are
/// hedged accordingly.
#[derive(Debug, Deserialize, Clone)]
pub struct FilterDef {
    /// Optional free-form description; marked `allow(dead_code)`, so it is
    /// apparently not read by the program itself.
    #[allow(dead_code)]
    pub description: Option<String>,
    /// Regex source tested against candidate command strings by `find_filter`.
    /// When several filters match the same candidate, the one with the longest
    /// pattern string wins.
    pub match_command: String,
    /// Presumably enables stripping of ANSI escape sequences from the output —
    /// confirm against the apply logic.
    #[serde(default)]
    pub strip_ansi: bool,
    /// Patterns for lines to drop (presumably regexes — confirm).
    #[serde(default)]
    pub strip_lines_matching: Vec<String>,
    /// Patterns for lines to keep (presumably regexes — confirm).
    #[serde(default)]
    pub keep_lines_matching: Vec<String>,
    /// Line-count limits. How `max_lines`, `head_lines` and `tail_lines`
    /// interact is decided by the apply logic, which is not shown here.
    pub max_lines: Option<usize>,
    pub head_lines: Option<usize>,
    pub tail_lines: Option<usize>,
    /// Text reported when filtering leaves nothing (see
    /// `passthrough_when_emptied` for the opt-in alternative).
    pub on_empty: Option<String>,
    /// When the filter reduces *non-empty* command output to nothing — usually an
    /// unexpected output shape the keep/extract rules don't recognize (e.g.
    /// `git log --oneline` against a filter tuned for the full log format) — emit a
    /// bounded, ANSI-stripped view of the real output instead of `on_empty`.
    /// Opt-in so summary filters (cargo test all-pass → "ok") keep their behavior;
    /// only format-specific filters that would otherwise report a *false* "nothing
    /// here" (git-log, git-diff) set it.
    #[serde(default)]
    pub passthrough_when_emptied: bool,
    /// Short-circuit rules; see `MatchOutput`.
    #[serde(default)]
    pub match_output: Vec<MatchOutput>,
    /// Presumably a per-line length cap — TODO confirm the unit (chars vs bytes).
    pub truncate_lines_at: Option<usize>,
    /// Presumably extends filtering to stderr; marked `allow(dead_code)`, so it
    /// may not be wired up yet — verify before relying on it.
    #[serde(default)]
    #[allow(dead_code)]
    pub filter_stderr: bool,

    /// Regex replacement rules: each entry is [pattern, replacement].
    /// Applied after line filtering, before sizing. Enables custom transformations
    /// like shortening paths, normalizing timestamps, etc.
    #[serde(default)]
    pub replace_patterns: Vec<[String; 2]>,

    /// Extract only content between start/end markers (inclusive).
    /// Useful for pulling out specific sections like test failures, error blocks, etc.
    #[serde(default)]
    pub extract_sections: Vec<ExtractSection>,

    /// Semantic filter: keep only lines semantically relevant to a query.
    /// Uses embeddings to score relevance. Requires daemon or in-process embed.
    #[serde(default)]
    pub semantic_filter: Option<SemanticFilterDef>,

    /// Deduplicate similar blocks (not just exact lines).
    /// Groups consecutive blocks by structural similarity.
    #[serde(default)]
    pub deduplicate_blocks: Option<DeduplicateBlocksDef>,

    /// Intelligent JSON summarization beyond simple compaction.
    /// Extracts key fields, summarizes arrays, preserves structure.
    ///
    /// `find_filter` also uses the presence of this section as the signal that
    /// a filter may be applied to commands that request JSON output.
    #[serde(default)]
    pub summarize_json: Option<SummarizeJsonDef>,

    /// Hard token budget: truncate intelligently to stay under token limit.
    /// Prioritizes head/tail/errors/semantic relevance.
    pub token_budget: Option<usize>,
}

/// One entry of `FilterDef::extract_sections`: a start/end marker pair that
/// delimits a region of output to extract.
#[derive(Debug, Deserialize, Clone)]
pub struct ExtractSection {
    /// Pattern marking where a section begins (presumably a regex — confirm).
    pub start_pattern: String,
    /// Pattern marking where a section ends (presumably a regex — confirm).
    pub end_pattern: String,
    /// Presumably controls whether the marker lines themselves are kept;
    /// defaults to `false` when omitted.
    #[serde(default)]
    pub include_markers: bool,
    /// Presumably caps how many sections are extracted; `None` when omitted.
    #[serde(default)]
    pub max_matches: Option<usize>,
}

/// Configuration of a filter's `semantic_filter` section.
///
/// `semantic_filter_issues` validates `model` and `threshold` for
/// `tokenix doctor`.
#[derive(Debug, Deserialize, Clone)]
pub struct SemanticFilterDef {
    /// Query to score relevance against (e.g., "error", "test failure", "build output")
    pub query: String,
    /// Minimum cosine similarity to keep (0.0-1.0)
    ///
    /// Falls back to `default_semantic_threshold()` when omitted.
    #[serde(default = "default_semantic_threshold")]
    pub threshold: f32,
    /// Always keep lines matching these patterns regardless of score
    #[serde(default)]
    pub always_keep: Vec<String>,
    /// Model to use (defaults to index model)
    pub model: Option<String>,
}

/// Serde default for `SemanticFilterDef::threshold`.
fn default_semantic_threshold() -> f32 {
    const DEFAULT_THRESHOLD: f32 = 0.3;
    DEFAULT_THRESHOLD
}

/// Configuration of a filter's `deduplicate_blocks` section.
#[derive(Debug, Deserialize, Clone)]
pub struct DeduplicateBlocksDef {
    /// Minimum lines per block to consider for deduplication
    ///
    /// Falls back to `default_min_block_lines()` when omitted.
    #[serde(default = "default_min_block_lines")]
    pub min_block_lines: usize,
    /// Similarity threshold for block comparison (0.0-1.0)
    ///
    /// Falls back to `default_block_similarity()` when omitted.
    #[serde(default = "default_block_similarity")]
    pub similarity: f32,
    /// Regex to identify block boundaries (default: blank line)
    #[serde(default)]
    pub block_delimiter: Option<String>,
}

/// Serde default for `DeduplicateBlocksDef::min_block_lines`.
fn default_min_block_lines() -> usize {
    const DEFAULT_MIN_BLOCK_LINES: usize = 3;
    DEFAULT_MIN_BLOCK_LINES
}

/// Serde default for `DeduplicateBlocksDef::similarity`.
fn default_block_similarity() -> f32 {
    const DEFAULT_SIMILARITY: f32 = 0.8;
    DEFAULT_SIMILARITY
}

/// Configuration of a filter's `summarize_json` section.
///
/// `find_filter` treats the mere presence of this section as permission to
/// apply the filter to commands that request JSON output.
#[derive(Debug, Deserialize, Clone)]
pub struct SummarizeJsonDef {
    /// Max array elements to show before summarizing
    ///
    /// Falls back to `default_max_array_items()` when omitted.
    #[serde(default = "default_max_array_items")]
    pub max_array_items: usize,
    /// Max object depth to traverse
    ///
    /// Falls back to `default_max_depth()` when omitted.
    #[serde(default = "default_max_depth")]
    pub max_depth: usize,
    /// Fields to always include (dot notation for nested)
    #[serde(default)]
    pub always_include: Vec<String>,
    /// Fields to exclude
    #[serde(default)]
    pub exclude: Vec<String>,
}

/// Serde default for `SummarizeJsonDef::max_array_items`.
fn default_max_array_items() -> usize {
    const DEFAULT_MAX_ARRAY_ITEMS: usize = 10;
    DEFAULT_MAX_ARRAY_ITEMS
}

/// Serde default for `SummarizeJsonDef::max_depth`.
fn default_max_depth() -> usize {
    const DEFAULT_MAX_DEPTH: usize = 3;
    DEFAULT_MAX_DEPTH
}

/// Top-level shape of a filter TOML file as read by `parse_filter_file_named`:
/// a `filters` table keyed by filter name. Any other top-level keys are not
/// captured by this struct.
#[derive(Debug, Deserialize)]
struct FilterFile {
    /// Filter definitions by name; an absent table deserializes to an empty map.
    #[serde(default)]
    filters: HashMap<String, FilterDef>,
}

/// A filter definition tagged with its name and the tier it was loaded from,
/// as produced by `load_active_filters`.
pub struct ActiveFilter {
    /// Key of the filter inside its file's `filters` table.
    pub name: String,
    /// Origin tier: `"local"`, `"user"` or `"bundled"` (the values assigned by
    /// `load_active_filters`).
    pub source: &'static str,
    /// The parsed filter definition.
    pub filter: FilterDef,
}

// Handle for the filter files shipped inside the binary: the `Embed` derive is
// pointed at `assets/filters` and restricted to `*.toml`. Read through
// `BundledFilters::iter()` / `BundledFilters::get()`.
#[derive(Embed)]
#[folder = "assets/filters"]
#[include = "*.toml"]
// Rebuild trigger for new filters
struct BundledFilters;

/// Directory holding the per-user filter files: `<home>/.tokenix/filters`.
///
/// When no home directory can be determined the path is built relative to the
/// current directory (`./.tokenix/filters`).
pub fn filters_dir() -> PathBuf {
    let base = match dirs::home_dir() {
        Some(home) => home,
        None => PathBuf::from("."),
    };
    let mut dir = base;
    dir.push(".tokenix");
    dir.push("filters");
    dir
}

/// Parses the text of one filter TOML file into `(name, definition)` pairs.
///
/// Best-effort: content that fails to parse as a `FilterFile` yields an empty
/// list rather than an error.
fn parse_filter_file_named(content: &str) -> Vec<(String, FilterDef)> {
    match toml::from_str::<FilterFile>(content) {
        Ok(file) => file.filters.into_iter().collect(),
        Err(_) => Vec::new(),
    }
}

/// User-tier filter definitions, with the names dropped.
pub fn load_user_filters() -> Vec<FilterDef> {
    let named = load_user_filters_named();
    let mut filters = Vec::with_capacity(named.len());
    for (_name, def) in named {
        filters.push(def);
    }
    filters
}

/// Reads every `*.toml` file directly inside `dir` and returns all filters
/// they define as `(name, definition)` pairs.
///
/// Best-effort by design: a missing or unreadable directory, unreadable
/// entries, unreadable files and unparsable files are skipped silently, so a
/// broken filter file never prevents the remaining filters from loading.
fn load_named_filters_from_dir(dir: &std::path::Path) -> Vec<(String, FilterDef)> {
    // `read_dir` already fails for a directory that does not exist, so no
    // separate existence check is needed.
    let Ok(entries) = std::fs::read_dir(dir) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    for entry in entries.flatten() {
        let path = entry.path();
        if path.extension().and_then(|e| e.to_str()) != Some("toml") {
            continue;
        }
        if let Ok(content) = std::fs::read_to_string(&path) {
            result.extend(parse_filter_file_named(&content));
        }
    }
    result
}

/// Named filters from the per-user directory returned by `filters_dir()`.
pub fn load_user_filters_named() -> Vec<(String, FilterDef)> {
    load_named_filters_from_dir(&filters_dir())
}

/// Named filters from `<project root>/.tokenix/filters`, where the project
/// root is resolved from the current working directory (falling back to `.`
/// when the working directory cannot be determined).
pub fn load_local_filters_named() -> Vec<(String, FilterDef)> {
    let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
    let root = crate::store::find_project_root(&cwd);
    load_named_filters_from_dir(&root.join(".tokenix").join("filters"))
}

/// Project-local filter definitions, with the names dropped.
pub fn load_local_filters() -> Vec<FilterDef> {
    let named = load_local_filters_named();
    let mut filters = Vec::with_capacity(named.len());
    for (_name, def) in named {
        filters.push(def);
    }
    filters
}

/// Bundled (embedded) filter definitions, with the names dropped.
pub fn load_bundled_filters() -> Vec<FilterDef> {
    let named = load_bundled_filters_named();
    let mut filters = Vec::with_capacity(named.len());
    for (_name, def) in named {
        filters.push(def);
    }
    filters
}

/// Named filters from every embedded filter file.
///
/// Assets that cannot be fetched or are not valid UTF-8 are skipped; files that
/// fail to parse contribute nothing.
pub fn load_bundled_filters_named() -> Vec<(String, FilterDef)> {
    let mut named = Vec::new();
    for asset in BundledFilters::iter() {
        let Some(file) = BundledFilters::get(&asset) else {
            continue;
        };
        let Ok(text) = std::str::from_utf8(file.data.as_ref()) else {
            continue;
        };
        named.extend(parse_filter_file_named(text));
    }
    named
}

/// Representative sample input per bundled filter: the `input` of the first
/// `[[tests.<name>]]` case found for each name, keyed by that name. The TUI
/// feeds these through a filter to preview its effect
/// (input → apply_filter → output).
///
/// If the same name shows up in more than one bundled file, the entry recorded
/// first is kept. Assets that cannot be read, decoded or parsed are skipped.
pub fn sample_inputs() -> HashMap<String, String> {
    #[derive(Deserialize)]
    struct SampleCase {
        input: String,
    }
    #[derive(Deserialize)]
    struct TestsOnly {
        #[serde(default)]
        tests: HashMap<String, Vec<SampleCase>>,
    }

    let mut samples: HashMap<String, String> = HashMap::new();
    for asset in BundledFilters::iter() {
        let parsed = BundledFilters::get(&asset).and_then(|file| {
            let text = std::str::from_utf8(file.data.as_ref()).ok()?;
            toml::from_str::<TestsOnly>(text).ok()
        });
        let Some(parsed) = parsed else {
            continue;
        };
        for (filter_name, cases) in parsed.tests {
            let Some(first_case) = cases.into_iter().next() else {
                continue;
            };
            if !samples.contains_key(&filter_name) {
                samples.insert(filter_name, first_case.input);
            }
        }
    }
    samples
}

/// Total number of embedded golden test cases (entries under every
/// `tests.<name>` array) across all bundled filter files; reported by
/// `tokenix doctor`. Assets that cannot be read, decoded or parsed count as
/// zero.
pub fn bundled_test_case_count() -> usize {
    #[derive(Deserialize)]
    struct TestsOnly {
        #[serde(default)]
        tests: HashMap<String, Vec<toml::Value>>,
    }

    BundledFilters::iter()
        .filter_map(|asset| {
            let file = BundledFilters::get(&asset)?;
            let text = std::str::from_utf8(file.data.as_ref()).ok()?;
            let parsed = toml::from_str::<TestsOnly>(text).ok()?;
            Some(parsed.tests.values().map(|cases| cases.len()).sum::<usize>())
        })
        .sum()
}

pub fn load_active_filters() -> Vec<ActiveFilter> {
    let mut result: Vec<ActiveFilter> = load_local_filters_named()
        .into_iter()
        .map(|(name, filter)| ActiveFilter {
            name,
            source: "local",
            filter,
        })
        .collect();
    result.extend(
        load_user_filters_named()
            .into_iter()
            .map(|(name, filter)| ActiveFilter {
                name,
                source: "user",
                filter,
            }),
    );
    result.extend(
        load_bundled_filters_named()
            .into_iter()
            .map(|(name, filter)| ActiveFilter {
                name,
                source: "bundled",
                filter,
            }),
    );
    result
}

/// All filter definitions in precedence order: project-local first, then
/// per-user, then the bundled ones as fallback.
pub fn load_all_filters() -> Vec<FilterDef> {
    load_local_filters()
        .into_iter()
        .chain(load_user_filters())
        .chain(load_bundled_filters())
        .collect()
}

/// Lists configuration problems in `f`'s `semantic_filter` section for
/// `tokenix doctor`: a model id the embed module does not know, and a
/// threshold outside `0.0..=1.0`. An empty list means healthy (or no
/// `semantic_filter` section at all).
pub fn semantic_filter_issues(f: &FilterDef) -> Vec<String> {
    let Some(sem) = &f.semantic_filter else {
        return Vec::new();
    };

    let mut problems = Vec::new();

    match &sem.model {
        Some(model) if !crate::embed::is_known_model(model) => {
            problems.push(format!(
                "semantic_filter.model '{}' unknown — falls back to '{}'",
                model,
                crate::embed::DEFAULT_MODEL_ID
            ));
        }
        _ => {}
    }

    let threshold_in_range = (0.0..=1.0).contains(&sem.threshold);
    if !threshold_in_range {
        problems.push(format!(
            "semantic_filter.threshold {} outside 0.0-1.0",
            sem.threshold
        ));
    }

    problems
}

pub fn find_filter<'a>(cmd: &str, filters: &'a [FilterDef]) -> Option<&'a FilterDef> {
    let effective = get_effective_command(cmd);
    let tokens = tokenize_command(&effective);

    // 1. Help flags bypass: let raw help outputs pass through unfiltered
    let has_help = tokens.iter().any(|t| {
        let t_lower = t.to_lowercase();
        t == "-h"
            || t == "-help"
            || t_lower == "--help"
            || t == "/h"
            || t == "/?"
            || t_lower == "help"
            || t_lower.starts_with("--help-")
            || t_lower.starts_with("-help-")
    });
    if has_help {
        return None;
    }

    // 2. Version flags bypass: version output is short and shouldn't be masked as success.
    // Lowercase `-v` is treated as a version flag only for tools where it actually queries version
    // (e.g. git, node, docker, npm), to avoid falsely bypassing verbose runs (e.g. cargo, pytest, python).
    let mut has_version = false;
    for (i, t) in tokens.iter().enumerate() {
        let t_lower = t.to_lowercase();
        if t_lower == "--version" || t_lower == "version" {
            has_version = true;
            break;
        }
        if t == "-V" {
            has_version = true;
            break;
        }
        if t == "-v" && i > 0 {
            let prev_tool = std::path::Path::new(&tokens[i - 1])
                .file_name()
                .and_then(|f| f.to_str())
                .unwrap_or(&tokens[i - 1])
                .to_lowercase();
            let prev_tool = prev_tool.strip_suffix(".exe").unwrap_or(&prev_tool);
            if matches!(
                prev_tool,
                "git" | "node" | "docker" | "npm" | "npx" | "pnpm" | "yarn" | "bun"
            ) {
                has_version = true;
                break;
            }
        }
    }
    if !has_version && !tokens.is_empty() && (tokens[0] == "version" || tokens[0] == "--version") {
        has_version = true;
    }

    if has_version && tokens.len() <= 3 {
        return None;
    }

    // 3. Debug/verbose flags bypass: keep troubleshooting logs intact
    let has_debug_or_verbose = tokens.iter().any(|t| {
        let t_lower = t.to_lowercase();
        t == "-vv"
            || t == "-vvv"
            || t_lower == "--debug"
            || t_lower == "--verbose"
            || t_lower == "--trace"
            || t_lower.starts_with("--log-level=debug")
            || t_lower.starts_with("--log-level=trace")
    });
    if has_debug_or_verbose {
        return None;
    }

    // 4. YAML check to prevent breaking YAML outputs
    let has_yaml = tokens.iter().any(|t| {
        let t_lower = t.to_lowercase();
        t_lower == "--yaml" || t_lower == "-o=yaml" || t_lower == "yaml"
    }) || effective.contains("-o yaml")
        || effective.contains("--format yaml")
        || effective.contains("--format=yaml");
    if has_yaml {
        return None;
    }

    // 5. JSON check to prevent breaking JSON outputs unless explicitly handled by the filter
    let has_json = tokens.iter().enumerate().any(|(i, t)| {
        let t_lower = t.to_lowercase();
        if t_lower == "--json" || t_lower == "-o=json" || t_lower == "json" {
            true
        } else if t == "-j" {
            if i > 0 {
                let prev_tool = std::path::Path::new(&tokens[i - 1])
                    .file_name()
                    .and_then(|f| f.to_str())
                    .unwrap_or(&tokens[i - 1])
                    .to_lowercase();
                let prev_tool = prev_tool.strip_suffix(".exe").unwrap_or(&prev_tool);
                !matches!(
                    prev_tool,
                    "cargo" | "make" | "ninja" | "cmake" | "mvn" | "gradle" | "build"
                )
            } else {
                true
            }
        } else {
            false
        }
    }) || effective.contains("-o json")
        || effective.contains("--format json")
        || effective.contains("--format=json")
        || effective.contains("--message-format=json")
        || effective.contains("--message-format json");

    let shell_body = unwrap_shell_runner(cmd);
    let base = shell_body.as_deref().unwrap_or(cmd);
    let segments = split_on_operators(base);

    let mut prioritized_candidates = Vec::new();

    // 1. Segment-level candidates: last segment first
    for segment in segments.iter().rev() {
        let effective = get_effective_command(segment);
        push_unique(&mut prioritized_candidates, &effective);
        push_unique(&mut prioritized_candidates, segment);
    }

    // 2. Full compound candidates
    let effective_full = get_effective_command(cmd);
    push_unique(&mut prioritized_candidates, &effective_full);
    if let Some(body) = &shell_body {
        let effective_body = get_effective_command(body);
        push_unique(&mut prioritized_candidates, &effective_body);
        push_unique(&mut prioritized_candidates, body);
    }
    push_unique(&mut prioritized_candidates, cmd);

    // Find the first filter that matches any prioritized candidate, resolving collisions
    // by picking the one with the longest (most specific) match_command pattern.
    for candidate in &prioritized_candidates {
        let mut best_match: Option<&'a FilterDef> = None;
        let mut max_len = 0;

        for f in filters {
            if let Ok(re) = Regex::new(&f.match_command) {
                if re.is_match(candidate) {
                    // JSON bypass: if command asks for JSON output, but filter does not support summarizing JSON, bypass
                    if has_json && f.summarize_json.is_none() {
                        continue;
                    }
                    let pattern_len = f.match_command.len();
                    if pattern_len > max_len {
                        max_len = pattern_len;
                        best_match = Some(f);
                    }
                }
            }
        }
        if let Some(f) = best_match {
            return Some(f);
        }
    }
    None
}

/// Splits a command line into argv-style tokens.
///
/// Rules, as implemented:
/// - unquoted whitespace separates tokens;
/// - single and double quotes group characters into one token and are removed;
/// - a backslash makes the next character literal (inside quotes as well) and
///   is itself removed; a trailing lone backslash is kept as `\`;
/// - a token that ends up empty (e.g. `""`) is dropped.
///
/// Shell operators are not interpreted: `a && b` yields `["a", "&&", "b"]`.
pub fn tokenize_command(command: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut current = String::new();
    let mut quote: Option<char> = None;
    let mut escaping = false;

    for c in command.trim().chars() {
        if escaping {
            current.push(c);
            escaping = false;
            continue;
        }

        if c == '\\' {
            escaping = true;
            continue;
        }

        if let Some(q) = quote {
            if c == q {
                quote = None;
            } else {
                current.push(c);
            }
            continue;
        }

        if c == '\'' || c == '"' {
            quote = Some(c);
            continue;
        }

        if c.is_whitespace() {
            if !current.is_empty() {
                tokens.push(std::mem::take(&mut current));
            }
            continue;
        }

        current.push(c);
    }

    // A dangling escape at end of input is preserved literally.
    if escaping {
        current.push('\\');
    }

    if !current.is_empty() {
        tokens.push(current);
    }

    tokens
}

/// If `cmd` launches a shell with an inline command (`bash -c '…'`,
/// `cmd /C …`, `pwsh -Command …`), returns that inline command, trimmed.
/// Returns `None` for anything else, including a shell started without a
/// command flag or with the flag as its last argument.
///
/// The launcher is recognized by the lowercased base name of the first token,
/// ignoring a `.exe` suffix. What counts as the command flag depends on it:
/// - `cmd`: `/c` or `-c` (case-insensitive);
/// - `powershell` / `pwsh`: `-c`, `-command` or `--command` (case-insensitive);
/// - POSIX-style shells: a short-option cluster containing `c` (`-c`, `-lc`,
///   `-ec`, …) or the long form `--command`.
///
/// Other long options are never treated as the command flag. Matching any
/// dash-argument containing the letter `c` would make `bash --norc -c 'ls'`
/// unwrap to `-c`, and `bash --rcfile FILE -c 'ls'` unwrap to `FILE`.
pub fn unwrap_shell_runner(cmd: &str) -> Option<String> {
    let argv = tokenize_command(cmd);
    let first = argv.first()?;

    let launcher_name = std::path::Path::new(first)
        .file_name()
        .and_then(|f| f.to_str())
        .unwrap_or(first)
        .to_lowercase();
    let launcher = launcher_name.strip_suffix(".exe").unwrap_or(&launcher_name);

    let is_shell = matches!(
        launcher,
        "bash"
            | "sh"
            | "zsh"
            | "fish"
            | "dash"
            | "ksh"
            | "mksh"
            | "ash"
            | "csh"
            | "tcsh"
            | "cmd"
            | "powershell"
            | "pwsh"
    );
    if !is_shell {
        return None;
    }

    // Walk (argument, following argument) pairs after the launcher, so a flag
    // in last position — which has no command body — is never considered.
    for pair in argv[1..].windows(2) {
        let (arg, body) = (&pair[0], &pair[1]);
        let is_command_flag = match launcher {
            "cmd" => arg.eq_ignore_ascii_case("/c") || arg.eq_ignore_ascii_case("-c"),
            "powershell" | "pwsh" => {
                arg.eq_ignore_ascii_case("-c")
                    || arg.eq_ignore_ascii_case("-command")
                    || arg.eq_ignore_ascii_case("--command")
            }
            _ => {
                let is_short_cluster = arg.starts_with('-') && !arg.starts_with("--");
                arg == "--command" || (is_short_cluster && arg.contains('c'))
            }
        };

        if is_command_flag {
            return Some(body.trim().to_string());
        }
    }

    None
}

/// Whether `s` has the shape `NAME=...`, where `NAME` starts with an ASCII
/// letter or `_` and continues with ASCII alphanumerics or `_`. The value part
/// may be empty (`FOO=` qualifies).
fn is_env_assignment(s: &str) -> bool {
    let Some((name, _value)) = s.split_once('=') else {
        return false;
    };
    let mut bytes = name.bytes();
    match bytes.next() {
        Some(b) if b.is_ascii_alphabetic() || b == b'_' => {}
        _ => return false,
    }
    bytes.all(|b| b.is_ascii_alphanumeric() || b == b'_')
}

/// Removes environment setup that precedes the real command:
///
/// - leading `NAME=value` assignments, and
/// - an `env` / `cross-env` launcher together with its own assignments and
///   options (`-i`, `-0`, `--ignore-environment`, `--debug`; the value-taking
///   `-u`/`--unset`, `-C`/`--chdir`, `-S`/`--split-string` in both `--opt value`
///   and `--opt=value` form; and a terminating `--`).
///
/// Returns the remaining tokens. The result is empty when nothing follows the
/// prefix — including a truncated invocation such as `env -u`, where the option
/// that expects a value is the last token.
fn strip_leading_env_assignments(argv: &[String]) -> Vec<String> {
    let mut index = 0;
    while index < argv.len() && is_env_assignment(&argv[index]) {
        index += 1;
    }

    if let Some(launcher) = argv.get(index) {
        let cmd_name = std::path::Path::new(launcher)
            .file_name()
            .and_then(|f| f.to_str())
            .unwrap_or(launcher);
        if cmd_name == "env" || cmd_name == "cross-env" {
            index += 1;
            while let Some(arg) = argv.get(index) {
                let arg = arg.as_str();
                if arg == "--" {
                    // End of `env` options; the command starts right after.
                    index += 1;
                    break;
                }
                let consumed = match arg {
                    _ if is_env_assignment(arg) => 1,
                    "-i" | "-0" | "--ignore-environment" | "--debug" => 1,
                    // Option plus its separate value. The value may be missing
                    // on a truncated command line; the clamp below covers that.
                    "-u" | "--unset" | "-C" | "--chdir" | "-S" | "--split-string" => 2,
                    _ if arg.starts_with("--unset=")
                        || arg.starts_with("--chdir=")
                        || arg.starts_with("--split-string=") =>
                    {
                        1
                    }
                    _ => break,
                };
                index += consumed;
            }
        }
    }

    // `index` can overshoot by one when a value-taking option is the last
    // token; slicing past the end would panic, so clamp to the length.
    argv[index.min(argv.len())..].to_vec()
}

/// Strip known command-timing / resource-limit wrappers that prefix the real
/// command without altering its behaviour for filter-matching purposes:
///
/// - `timeout [OPTS] DURATION CMD`  (GNU coreutils)
/// - `time CMD`
/// - `nice [-n N | --adjustment[=]N] CMD`
/// - `ionice [-c C] [-n N] [-t] CMD`
/// - `& CMD`  (PowerShell call operator)
///
/// A wrapper is recognized by the lowercased base name of its token, ignoring
/// a `.exe` suffix. Each wrapper is peeled in a loop so stacked prefixes like
/// `timeout 30 nice -n 10 pnpm run test` resolve to `pnpm run test`.
///
/// Returns the tokens that remain after the wrappers. The result is empty when
/// nothing follows them, including truncated invocations such as `nice -n`
/// where an option that expects a value is the last token.
fn strip_leading_wrappers(argv: &[String]) -> Vec<String> {
    let mut index = 0;

    while index < argv.len() {
        let name_raw = std::path::Path::new(&argv[index])
            .file_name()
            .and_then(|f| f.to_str())
            .unwrap_or(&argv[index])
            .to_lowercase();
        let name = name_raw.strip_suffix(".exe").unwrap_or(&name_raw);

        match name {
            // `& CMD` (PowerShell call operator) and `time CMD` are
            // single-token prefixes.
            "&" | "time" => {
                index += 1;
            }
            // `nice [-n N | --adjustment[=]N] CMD`
            "nice" => {
                index += 1;
                if let Some(a) = argv.get(index) {
                    if a == "-n" || a == "--adjustment" {
                        // Flag plus value. The value may be missing on a
                        // truncated command line; the clamp below covers that.
                        index += 2;
                    } else if a.starts_with("--adjustment=") {
                        index += 1;
                    }
                }
            }
            // `ionice [-c C] [-n N] [-t] CMD`
            "ionice" => {
                index += 1;
                while index < argv.len() {
                    let a = &argv[index];
                    if (a == "-c" || a == "-n") && index + 1 < argv.len() {
                        index += 2;
                    } else if a == "-t" {
                        index += 1;
                    } else {
                        break;
                    }
                }
            }
            // `timeout [OPTS] DURATION CMD`
            // Options: --foreground, --preserve-status, --verbose, -k DUR, -s SIG
            "timeout" => {
                index += 1;
                let mut found_duration = false;
                while index < argv.len() {
                    let a = &argv[index];
                    if matches!(
                        a.as_str(),
                        "--foreground" | "--preserve-status" | "--verbose"
                    ) {
                        index += 1;
                        continue;
                    }
                    if (a == "-k" || a == "--kill-after" || a == "-s" || a == "--signal")
                        && index + 1 < argv.len()
                    {
                        index += 2;
                        continue;
                    }
                    if a.starts_with("--kill-after=") || a.starts_with("--signal=") {
                        index += 1;
                        continue;
                    }
                    // Any other dash-argument is assumed to be a flag without a value.
                    if a.starts_with('-') {
                        index += 1;
                        continue;
                    }
                    // First non-option argument is the DURATION — skip it.
                    index += 1;
                    found_duration = true;
                    break;
                }
                if !found_duration {
                    // Malformed `timeout` invocation — stop peeling.
                    break;
                }
            }
            _ => break,
        }
    }

    // `index` can overshoot by one after `nice -n` with no value; slicing past
    // the end would panic, so clamp to the length.
    argv[index.min(argv.len())..].to_vec()
}

/// Removes a leading package-runner prefix so that the filter of the tool being
/// run can match: `uv run pytest` -> `pytest`, `python -m ruff check` ->
/// `ruff check`, `bunx biome check` -> `biome check`, `npx tsc` -> `tsc`; also
/// `pnpm exec` / `pnpm dlx` / `yarn dlx` / `bun x` / `deno run` / `deno task` /
/// `bundle exec`.
///
/// Options belonging to the runner itself are skipped as well (e.g.
/// `npx --no-install`, `uv run --with requests`), up to an optional `--`.
/// Input that does not start with a known runner is returned unchanged. If
/// skipping the runner's options would leave nothing, only the runner tokens
/// are removed.
fn strip_package_runner(argv: &[String]) -> &[String] {
    let Some(first) = argv.first() else {
        return argv;
    };
    let lowered = std::path::Path::new(first)
        .file_name()
        .and_then(|f| f.to_str())
        .unwrap_or(first)
        .to_lowercase();
    let runner = lowered.strip_suffix(".exe").unwrap_or(&lowered);

    // How many leading tokens make up the runner itself (0 = not a runner).
    let runner_len = if argv.len() > 1 && matches!(runner, "npx" | "bunx" | "uvx") {
        1
    } else if argv.len() > 2 {
        let sub = argv[1].as_str();
        let is_runner_pair = matches!(
            (runner, sub),
            ("uv", "run")
                | ("pnpm", "exec")
                | ("pnpm", "dlx")
                | ("yarn", "dlx")
                | ("bun", "x")
                | ("deno", "run")
                | ("deno", "task")
                | ("bundle", "exec")
        );
        let is_python_module = sub == "-m" && matches!(runner, "python" | "python3" | "py");
        if is_runner_pair || is_python_module {
            2
        } else {
            0
        }
    } else {
        0
    };

    if runner_len == 0 {
        return argv;
    }

    // Step over the runner's own options.
    let mut cursor = runner_len;
    while let Some(arg) = argv.get(cursor) {
        if !arg.starts_with('-') {
            break;
        }
        if arg == "--" {
            // A double dash ends the runner's options.
            cursor += 1;
            break;
        }
        let takes_value = matches!(arg.as_str(), "-p" | "--package" | "--with" | "--import");
        cursor += if takes_value && cursor + 1 < argv.len() {
            2
        } else {
            1
        };
    }

    if cursor < argv.len() {
        &argv[cursor..]
    } else {
        // Only options followed the runner: fall back to dropping just the runner.
        &argv[runner_len..]
    }
}

/// Removes tool-global options placed *between* a subcommand-style tool and
/// its subcommand, so that a filter anchored on `^git\s+add` still matches
/// `git -C /repo -c user.name=x add .`. Idiomatic invocations such as
/// `git -C dir`, `kubectl -n ns`, `docker -H host` or `cargo +nightly` would
/// otherwise slip past every subcommand filter and ship raw output.
///
/// The result is the tool token followed by everything from the first token
/// that is not a recognized global option, e.g.
/// `["git", "-C", "dir", "add", "."]` -> `["git", "add", "."]`. Recognized:
/// per-tool tables of value-taking and boolean flags, any `--opt=value` token,
/// and `+toolchain` for cargo. Scanning stops at the first unrecognized token.
/// Tools outside the known set — and inputs with nothing to strip — are
/// returned unchanged.
fn strip_subcommand_global_opts(argv: &[String]) -> Vec<String> {
    let Some(first) = argv.first() else {
        return Vec::new();
    };
    let lowered = std::path::Path::new(first)
        .file_name()
        .and_then(|f| f.to_str())
        .unwrap_or(first)
        .to_lowercase();
    let tool = lowered.strip_suffix(".exe").unwrap_or(&lowered);

    // Per tool: (flags that consume the following token, flags that stand alone).
    let (valued, boolean): (&[&str], &[&str]) = match tool {
        "git" => (
            &[
                "-C",
                "-c",
                "--git-dir",
                "--work-tree",
                "--exec-path",
                "--namespace",
                "--super-prefix",
            ],
            &[
                "-p",
                "-P",
                "--paginate",
                "--no-pager",
                "--bare",
                "--no-replace-objects",
                "--literal-pathspecs",
                "--glob-pathspecs",
                "--noglob-pathspecs",
                "--icase-pathspecs",
                "--no-optional-locks",
            ],
        ),
        "kubectl" => (
            &[
                "-n",
                "--namespace",
                "--context",
                "--kubeconfig",
                "-s",
                "--server",
                "--token",
                "--as",
                "--as-group",
                "--cluster",
                "--user",
                "--cache-dir",
                "--request-timeout",
                "--client-certificate",
                "--client-key",
                "--certificate-authority",
                "--tls-server-name",
            ],
            &["--insecure-skip-tls-verify"],
        ),
        "docker" => (
            &[
                "-H",
                "--host",
                "--context",
                "--config",
                "-l",
                "--log-level",
                "--tlscacert",
                "--tlscert",
                "--tlskey",
            ],
            &["-D", "--debug", "--tls", "--tlsverify"],
        ),
        // cargo has no table entries; only `+toolchain` and `--opt=value` apply.
        "cargo" => (&[], &[]),
        "pnpm" => (
            &[
                "-C",
                "--dir",
                "--filter",
                "--reporter",
                "--store-dir",
                "--virtual-store-dir",
                "--loglevel",
            ],
            &[
                "-w",
                "--workspace",
                "-r",
                "--recursive",
                "--prod",
                "-D",
                "--dev",
                "--no-optional",
                "--frozen-lockfile",
                "--silent",
            ],
        ),
        "bun" => (
            &[
                "--cwd",
                "-c",
                "--config",
                "--filter",
                "-p",
                "--port",
                "--env-file",
                "--profile",
            ],
            &[
                "--watch",
                "--hot",
                "--smol",
                "--no-buffer",
                "-v",
                "--version",
            ],
        ),
        _ => return argv.to_vec(),
    };

    // Advance `next` past every leading global option.
    let mut next = 1;
    while let Some(token) = argv.get(next) {
        let opt = token.as_str();
        // Number of tokens this option occupies; 0 means "not a global option".
        let consumed = if tool == "cargo" && opt.starts_with('+') {
            1 // `cargo +nightly test`
        } else if opt.starts_with("--") && opt.contains('=') {
            1 // --opt=value
        } else if opt.len() >= 2 && opt.starts_with('-') {
            if valued.contains(&opt) {
                // `--opt value` / `-C dir`; a missing value consumes just the flag.
                if next + 1 < argv.len() {
                    2
                } else {
                    1
                }
            } else if boolean.contains(&opt) {
                1
            } else {
                0
            }
        } else {
            0 // subcommand reached
        };

        if consumed == 0 {
            break;
        }
        next += consumed;
    }

    if next == 1 {
        return argv.to_vec();
    }
    std::iter::once(first.clone())
        .chain(argv[next..].iter().cloned())
        .collect()
}

/// Drops leading directory-change prefixes from a tokenized command line.
///
/// While the first token is `cd` or `pushd` and an operator token (`&&` or
/// `;`) follows it — either immediately (`cd && …`) or after exactly one
/// argument (`cd dir && …`) — everything up to and including that operator is
/// removed. At most eight prefixes are peeled; any other shape is returned
/// untouched.
fn strip_cd_and_operators(mut argv: &[String]) -> &[String] {
    let is_operator_at = |idx: usize, tokens: &[String]| {
        matches!(tokens.get(idx).map(String::as_str), Some("&&" | ";"))
    };

    for _ in 0..8 {
        let changes_dir = matches!(argv.first().map(String::as_str), Some("cd" | "pushd"));
        if !changes_dir {
            break;
        }
        // The bare form (`cd &&`) is checked before the one-argument form.
        let skip = if is_operator_at(1, argv) {
            2
        } else if is_operator_at(2, argv) {
            3
        } else {
            break;
        };
        argv = &argv[skip..];
    }
    argv
}

/// Reduces a raw command line to the command that effectively runs.
///
/// Up to 16 passes are made. Each pass first tries to unwrap a shell runner;
/// if that succeeds the inner command replaces the current one and the next
/// pass starts. Otherwise the command is tokenized and run through the
/// stripping helpers in order: leading env assignments, leading wrappers,
/// `cd`/`pushd` prefixes, package runner, subcommand global options.
///
/// The loop stops when tokenization yields nothing or when a pass removed no
/// tokens. After a pass that did strip something, the remaining tokens are
/// re-joined with single spaces.
pub fn get_effective_command(cmd: &str) -> String {
    let mut current = cmd.trim().to_string();
    let mut passes_left = 16;

    while passes_left > 0 {
        passes_left -= 1;

        if let Some(inner) = unwrap_shell_runner(&current) {
            current = inner;
            continue;
        }

        let tokens = tokenize_command(&current);
        if tokens.is_empty() {
            break;
        }

        let without_env = strip_leading_env_assignments(&tokens);
        let without_wrappers = strip_leading_wrappers(&without_env);
        let without_cd = strip_cd_and_operators(&without_wrappers);
        let without_runner = strip_package_runner(without_cd);
        let reduced = strip_subcommand_global_opts(without_runner);

        // Same token count means this pass stripped nothing: fixed point reached.
        if reduced.len() == tokens.len() {
            break;
        }
        current = reduced.join(" ");
    }

    current
}

/// Split a shell command into segments on the operators `&&`, `||`, `;` and the
/// pipe `|`, quote- and escape-aware. Operators are recognized regardless of
/// surrounding whitespace, so `a;b` and `a ; b` segment identically.
/// Quoted operators (e.g. `echo "a;b"`) are left intact.
///
/// Quoting follows POSIX shell rules:
/// - inside single quotes every character is literal, *including* backslash,
///   so `echo 'a\' ; b` splits into `echo 'a\'` and `b`;
/// - outside single quotes a backslash protects the next character, so
///   `a\;b` and `"a\";b"` are not split.
///
/// Segments are trimmed and empty segments are dropped. A lone `&` is not a
/// split point.
pub fn split_on_operators(cmd: &str) -> Vec<String> {
    let mut segments = Vec::new();
    let mut current = String::new();
    let mut quote: Option<char> = None;
    let mut chars = cmd.chars().peekable();

    while let Some(c) = chars.next() {
        // Single quotes: no escape processing at all, only the closing quote matters.
        if quote == Some('\'') {
            current.push(c);
            if c == '\'' {
                quote = None;
            }
            continue;
        }

        // Backslash (unquoted or inside double quotes) shields the next character.
        if c == '\\' {
            current.push(c);
            if let Some(escaped) = chars.next() {
                current.push(escaped);
            }
            continue;
        }

        // Inside double quotes: copy through until the matching quote.
        if let Some(q) = quote {
            current.push(c);
            if c == q {
                quote = None;
            }
            continue;
        }

        if c == '\'' || c == '"' {
            quote = Some(c);
            current.push(c);
            continue;
        }

        // Two-char operators `&&` / `||` first, so the trailing `|` of `||`
        // is not mistaken for a pipe split.
        if (c == '&' || c == '|') && chars.peek() == Some(&c) {
            chars.next();
            push_segment(&mut segments, &mut current);
            continue;
        }
        if c == ';' || c == '|' {
            push_segment(&mut segments, &mut current);
            continue;
        }

        current.push(c);
    }

    push_segment(&mut segments, &mut current);
    segments
}

/// Moves the trimmed contents of `current` into `segments` (unless blank) and
/// resets `current` for the next segment.
fn push_segment(segments: &mut Vec<String>, current: &mut String) {
    let trimmed = current.trim();
    if !trimmed.is_empty() {
        segments.push(trimmed.to_string());
    }
    current.clear();
}

/// Appends the trimmed `candidate` to `candidates` unless it is blank or an
/// identical entry is already present.
fn push_unique(candidates: &mut Vec<String>, candidate: &str) {
    let entry = candidate.trim();
    if entry.is_empty() {
        return;
    }
    let already_present = candidates.iter().any(|existing| existing.as_str() == entry);
    if !already_present {
        candidates.push(entry.to_owned());
    }
}

/// Lists every distinct command string a filter could be matched against for
/// `cmd`, in this order: the command itself, the body of its shell runner (if
/// any), its effective form, and then — for the command and the shell body —
/// each operator-separated segment followed by that segment's effective form.
#[cfg(test)]
pub fn derive_command_candidates(cmd: &str) -> Vec<String> {
    let mut candidates = Vec::new();

    push_unique(&mut candidates, cmd);

    let shell_body = unwrap_shell_runner(cmd);
    if let Some(body) = shell_body.as_deref() {
        push_unique(&mut candidates, body);
    }

    push_unique(&mut candidates, &get_effective_command(cmd));

    // Operator-aware segmentation: a filter anchored on its base command must
    // match regardless of position or spacing — e.g. `cd x;gitleaks`,
    // `npm i && gitleaks`, or `producer | gitleaks`.
    let bases = std::iter::once(cmd).chain(shell_body.as_deref());
    for base in bases {
        for segment in split_on_operators(base) {
            let effective = get_effective_command(&segment);
            push_unique(&mut candidates, &segment);
            push_unique(&mut candidates, &effective);
        }
    }

    candidates
}

/// Runs raw command `output` through the filter definition `f` and returns the
/// text to present instead.
///
/// Stages, in order:
/// 1. `match_output` rules are tested against the raw output; the first rule
///    whose `pattern` matches (and whose `unless` guard does not) returns its
///    `message` immediately.
/// 2. Optional ANSI stripping, then the text is split into lines.
/// 3. `strip_lines_matching`, then `keep_lines_matching`.
/// 4. `extract_sections`, `replace_patterns`, `deduplicate_blocks`,
///    `semantic_filter`, `summarize_json` — each only when configured.
/// 5. Line-count sizing (`apply_sizing`), per-line truncation
///    (`truncate_lines_at`), then the `token_budget` cap.
/// 6. If the result is blank: either a bounded view of the real output or the
///    `on_empty` message, as described at the end of the body.
///
/// Patterns that fail to compile as regexes are skipped silently in every
/// stage handled directly in this function.
pub fn apply_filter(output: &str, f: &FilterDef) -> String {
    // match_output short-circuits before any other transformation
    for mo in &f.match_output {
        if let Ok(re) = Regex::new(&mo.pattern) {
            if re.is_match(output) {
                // `unless` guard: do not short-circuit when the output also matches
                // this pattern, so errors/warnings are never masked as success.
                // An `unless` pattern that fails to compile counts as "no match",
                // i.e. the short-circuit still fires.
                if let Some(unless) = &mo.unless {
                    if Regex::new(unless)
                        .map(|u| u.is_match(output))
                        .unwrap_or(false)
                    {
                        continue;
                    }
                }
                return mo.message.clone();
            }
        }
    }

    // `s` is the working text; it is also what the empty-result fallback below shows.
    let s = if f.strip_ansi {
        crate::compress::strip_ansi(output)
    } else {
        output.to_string()
    };

    let mut lines: Vec<String> = s.lines().map(|l| l.to_string()).collect();

    // Drop every line matched by at least one strip pattern.
    if !f.strip_lines_matching.is_empty() {
        let patterns: Vec<Regex> = f
            .strip_lines_matching
            .iter()
            .filter_map(|p| Regex::new(p).ok())
            .collect();
        lines.retain(|l| !patterns.iter().any(|re| re.is_match(l)));
    }

    // Keep only lines matched by at least one keep pattern.
    if !f.keep_lines_matching.is_empty() {
        let patterns: Vec<Regex> = f
            .keep_lines_matching
            .iter()
            .filter_map(|p| Regex::new(p).ok())
            .collect();
        lines.retain(|l| patterns.iter().any(|re| re.is_match(l)));
    }

    // extract_sections - extract content between markers
    if !f.extract_sections.is_empty() {
        lines = apply_extract_sections(lines, &f.extract_sections);
    }

    // replace_patterns - regex replacements
    if !f.replace_patterns.is_empty() {
        lines = apply_replace_patterns(lines, &f.replace_patterns);
    }

    // deduplicate_blocks - structural deduplication
    if let Some(dedup) = &f.deduplicate_blocks {
        lines = apply_deduplicate_blocks(lines, dedup);
    }

    // semantic_filter - embedding-based relevance filtering
    if let Some(semantic) = &f.semantic_filter {
        lines = apply_semantic_filter(lines, semantic);
    }

    // summarize_json - intelligent JSON summarization
    if let Some(summarize) = &f.summarize_json {
        lines = apply_summarize_json(lines, summarize);
    }

    // Line-count limits (head / tail / max).
    let lines = apply_sizing(lines, f);

    // truncate_lines_at - cap each line's length (in bytes, cut on a char
    // boundary) before the lines are joined back together.
    let mut result = if let Some(max_len) = f.truncate_lines_at {
        lines
            .iter()
            .map(|l| truncate_at_char_boundary(l, max_len))
            .collect::<Vec<_>>()
            .join("\n")
    } else {
        lines.join("\n")
    };

    // token_budget - token limit applied to the joined text
    if let Some(budget) = f.token_budget {
        result = apply_token_budget(&result, budget);
    }

    if result.trim().is_empty() {
        // Fall back to a bounded view of the real output (instead of `on_empty`)
        // when filtering emptied a non-empty output AND either:
        //  - the filter opted in via `passthrough_when_emptied`, or
        //  - the original output carries a generic failure signal that the
        //    per-tool keep/strip rules didn't recognize. Without this guard a
        //    failed build/test/deploy whose error text doesn't match the
        //    tool's native error format would be masked as the success
        //    `on_empty` message — the same "never mask errors" rule the
        //    `match_output.unless` guard enforces.
        let masks_failure = f.on_empty.is_some() && output_has_failure_signal(output);
        if !output.trim().is_empty() && (f.passthrough_when_emptied || masks_failure) {
            // The fallback shows the first `max_lines` lines (40 when unset) of
            // `s`, with the same per-line truncation and token budget applied.
            let cap = f.max_lines.unwrap_or(40);
            let fallback: Vec<String> = s
                .lines()
                .take(cap)
                .map(|l| match f.truncate_lines_at {
                    Some(n) => truncate_at_char_boundary(l, n).to_string(),
                    None => l.to_string(),
                })
                .collect();
            let fb_text = fallback.join("\n");
            return match f.token_budget {
                Some(budget) => apply_token_budget(&fb_text, budget),
                None => fb_text,
            };
        }
        if let Some(msg) = &f.on_empty {
            return msg.clone();
        }
    }
    // Non-blank result, or blank with neither a fallback nor `on_empty` configured.
    result
}

/// Strict detection of an unambiguous command-failure signal in raw output.
/// Used as a safety net so a filter never reports its success `on_empty`
/// message for output that actually describes a failure.
///
/// The keyword alternatives (`error`, `fatal`, `panic`, `FAILED`, …) are
/// anchored to the start of a line (after optional whitespace) and must end at
/// a word boundary, so benign mid-line mentions like "0 errors" or
/// "no failures" do not match. Note that a line which *begins* with a bare
/// keyword — including `error: 0` — does match. The remaining alternatives
/// match anywhere: non-zero exit codes/statuses, signal and exception names,
/// bracketed levels (`[error]`), `level=error`-style fields, and `err:` /
/// `err!`.
///
/// The regex is compiled once on first use and cached for the process
/// lifetime.
///
/// # Panics
///
/// Panics on first use if the built-in pattern fails to compile.
fn output_has_failure_signal(output: &str) -> bool {
    use std::sync::OnceLock;
    static FAILURE: OnceLock<Regex> = OnceLock::new();
    let re = FAILURE.get_or_init(|| {
        Regex::new(
            r"(?m)^\s*(?:(?i:error|fatal|panic|panicked|exception|stderr|err)\b|FAILED\b|FAIL\b|---\s*FAIL\b|Traceback \(most recent call last\)|Unhandled exception|Exception in thread\b)|\b(?i:failed with exit code|exited with status|exited with code|exit status|exit code|exit)\b\s*[:=]?\s*[1-9]\d*|\b(?:SIGSEGV|SIGABRT|SIGILL|SIGBUS|AssertionError|NullPointerException|Segmentation fault|(?i:Command failed|command not found|failed to compile))\b|\[(?i:error|fatal|panic|failed|fail)\]|level=(?i:error|fatal|panic)|\b(?i:err)(?:!|:)"
        )
        .expect("failure-signal regex compiles")
    });
    re.is_match(output)
}

/// Keeps only the parts of `lines` that fall inside the configured sections.
///
/// For each section definition, a section opens on the first line matching
/// `start_pattern` and closes on the next line matching `end_pattern`. The
/// start and end marker lines are emitted only when `include_markers` is set;
/// the lines between them are always emitted. At most `max_matches` closed
/// sections are collected per definition (unlimited when unset). A section
/// still open at the end of the input is kept as well, so a block that runs to
/// the last line is not lost.
///
/// Definitions whose start or end pattern is not a valid regex are skipped.
/// When nothing at all is extracted the input is returned unchanged.
fn apply_extract_sections(lines: Vec<String>, sections: &[ExtractSection]) -> Vec<String> {
    let mut result = Vec::new();

    for section in sections {
        let (Ok(start_re), Ok(end_re)) = (
            Regex::new(&section.start_pattern),
            Regex::new(&section.end_pattern),
        ) else {
            continue;
        };

        let max_matches = section.max_matches.unwrap_or(usize::MAX);
        let mut matches = 0usize;
        let mut in_section = false;
        let mut section_lines: Vec<String> = Vec::new();

        // Iterate the lines directly: re-joining and re-splitting the text would
        // drop a trailing empty line, which may be the very line that closes a
        // section (e.g. `end_pattern = "^\s*$"`).
        for line in &lines {
            if !in_section {
                if start_re.is_match(line) {
                    in_section = true;
                    if section.include_markers {
                        section_lines.push(line.clone());
                    }
                }
                continue;
            }

            let closes = end_re.is_match(line);
            if section.include_markers || !closes {
                section_lines.push(line.clone());
            }
            if closes {
                result.append(&mut section_lines);
                matches += 1;
                in_section = false;
                if matches >= max_matches {
                    break;
                }
            }
        }

        // Unclosed section: keep what was collected regardless of
        // `include_markers` (the start marker itself was only collected when
        // `include_markers` is set).
        if in_section {
            result.append(&mut section_lines);
        }
    }

    if result.is_empty() {
        lines
    } else {
        result
    }
}

fn apply_replace_patterns(lines: Vec<String>, patterns: &[[String; 2]]) -> Vec<String> {
    lines
        .into_iter()
        .map(|mut line| {
            for [pattern, replacement] in patterns {
                if let Ok(re) = Regex::new(pattern) {
                    line = re.replace_all(&line, replacement.as_str()).to_string();
                }
            }
            line
        })
        .collect()
}

/// Collapses runs of consecutive, similar blocks into the first block plus an
/// `[... N similar block(s) omitted ...]` marker.
///
/// Lines are grouped into blocks separated by lines matching
/// `block_delimiter` (blank lines by default). A delimiter line that ends a
/// block is not reproduced in the output; a delimiter line seen while no block
/// is open becomes the first line of the next block.
///
/// Only blocks with at least `min_block_lines` lines take part in
/// deduplication. Shorter blocks are passed through verbatim — they are never
/// dropped and never counted as duplicates. Similarity is decided by
/// `blocks_similar` with the configured `similarity` threshold.
///
/// The input is returned unchanged when the delimiter is not a valid regex or
/// when fewer than two blocks are large enough to be compared.
fn apply_deduplicate_blocks(lines: Vec<String>, dedup: &DeduplicateBlocksDef) -> Vec<String> {
    let delimiter = dedup.block_delimiter.as_deref().unwrap_or(r"^\s*$");
    let Ok(delim_re) = Regex::new(delimiter) else {
        return lines;
    };

    let mut blocks: Vec<Vec<String>> = Vec::new();
    let mut current_block: Vec<String> = Vec::new();
    for line in &lines {
        if delim_re.is_match(line) && !current_block.is_empty() {
            blocks.push(std::mem::take(&mut current_block));
        } else {
            current_block.push(line.clone());
        }
    }
    if !current_block.is_empty() {
        blocks.push(current_block);
    }

    let min_len = dedup.min_block_lines;
    let comparable = blocks.iter().filter(|b| b.len() >= min_len).count();
    if comparable < 2 {
        return lines;
    }

    let mut result = Vec::with_capacity(lines.len());
    let mut i = 0;
    while i < blocks.len() {
        let block = &blocks[i];
        result.extend(block.iter().cloned());

        // Count the run of similar blocks that directly follows. Similar
        // blocks have the same length, so they are large enough too.
        let similar_count = if block.len() >= min_len {
            blocks[i + 1..]
                .iter()
                .take_while(|next| blocks_similar(block, next.as_slice(), dedup.similarity))
                .count()
        } else {
            0
        };

        if similar_count > 0 {
            result.push(format!(
                "[... {} similar block(s) omitted ...]",
                similar_count
            ));
        }
        i += 1 + similar_count;
    }

    result
}

/// Reports whether two blocks of equal length agree on at least `threshold`
/// (a fraction between 0 and 1) of their lines, compared position by position.
/// Blocks of different lengths are never similar.
fn blocks_similar(a: &[String], b: &[String], threshold: f32) -> bool {
    let total = a.len();
    if total != b.len() {
        return false;
    }

    let mut identical = 0usize;
    for (left, right) in a.iter().zip(b) {
        if left == right {
            identical += 1;
        }
    }

    identical as f32 / total as f32 >= threshold
}

/// Filters `lines` by relevance to the configured query: embedding-based
/// scoring is attempted first, and any error from it switches to the
/// keyword-overlap heuristic.
fn apply_semantic_filter(lines: Vec<String>, semantic: &SemanticFilterDef) -> Vec<String> {
    match apply_semantic_filter_with_embeddings(&lines, semantic) {
        Ok(filtered) => filtered,
        // Embeddings unavailable or failed: keyword-based heuristic instead.
        Err(_) => apply_semantic_filter_keyword_fallback(lines, semantic),
    }
}

/// Embedding-based relevance filter.
///
/// A line is kept when it matches one of the `always_keep` patterns, or when
/// it is at least five bytes long after trimming and the cosine similarity
/// between its embedding and the query embedding reaches
/// `semantic.threshold`. Shorter lines that are not force-kept are dropped
/// without being embedded.
///
/// # Errors
///
/// Propagates any error from embedding the query or a line.
fn apply_semantic_filter_with_embeddings(
    lines: &[String],
    semantic: &SemanticFilterDef,
) -> Result<Vec<String>, anyhow::Error> {
    use crate::embed::{embed_query, set_active_model};

    // Select the requested model; an unknown name only produces a warning here.
    if let Some(model) = &semantic.model {
        if !crate::embed::is_known_model(model) {
            eprintln!(
                "[tokenix] warning: semantic_filter.model '{}' is unknown; falling back to '{}'",
                model,
                crate::embed::DEFAULT_MODEL_ID
            );
        }
        set_active_model(model);
    }

    let query_vec = embed_query(&semantic.query)?;

    // Invalid `always_keep` patterns are ignored.
    let force_keep: Vec<Regex> = semantic
        .always_keep
        .iter()
        .filter_map(|p| Regex::new(p).ok())
        .collect();

    let mut kept = Vec::new();
    for line in lines {
        let keep = if force_keep.iter().any(|re| re.is_match(line)) {
            true
        } else if line.trim().len() < 5 {
            // Very short lines are skipped.
            false
        } else {
            let line_vec = embed_query(line)?;
            cosine_similarity(&query_vec, &line_vec) >= semantic.threshold
        };

        if keep {
            kept.push(line.clone());
        }
    }

    Ok(kept)
}

/// Keyword heuristic used when embeddings are unavailable.
///
/// A line is kept when it matches one of the `always_keep` patterns or
/// contains, case-insensitively, at least one whitespace-separated term of
/// `semantic.query`. Invalid `always_keep` patterns are ignored. With an empty
/// query only the `always_keep` matches survive.
fn apply_semantic_filter_keyword_fallback(
    lines: Vec<String>,
    semantic: &SemanticFilterDef,
) -> Vec<String> {
    // Lowercase the query terms once instead of once per line and term.
    let query_terms: Vec<String> = semantic
        .query
        .split_whitespace()
        .map(str::to_lowercase)
        .collect();
    let always_keep_patterns: Vec<Regex> = semantic
        .always_keep
        .iter()
        .filter_map(|p| Regex::new(p).ok())
        .collect();

    lines
        .into_iter()
        .filter(|line| {
            if always_keep_patterns.iter().any(|re| re.is_match(line)) {
                return true;
            }
            // Simple keyword overlap as proxy for semantic relevance
            let line_lower = line.to_lowercase();
            query_terms
                .iter()
                .any(|term| line_lower.contains(term.as_str()))
        })
        .collect()
}

/// Cosine of the angle between vectors `a` and `b`.
///
/// Returns `0.0` when the vectors are empty, differ in length, or either has
/// zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let comparable = !a.is_empty() && a.len() == b.len();
    if !comparable {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (norm_a, norm_b) = (magnitude(a), magnitude(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    dot / (norm_a * norm_b)
}

/// Summarizes the lines as a single JSON document.
///
/// The lines are joined and trimmed; if the text does not start with `{` or
/// `[`, or does not parse as JSON, the input is returned unchanged. Otherwise
/// the parsed value is reduced in place by `summarize_json_value` and
/// pretty-printed back into lines.
fn apply_summarize_json(lines: Vec<String>, summarize: &SummarizeJsonDef) -> Vec<String> {
    let joined = lines.join("\n");
    let body = joined.trim();

    let looks_like_json = matches!(body.chars().next(), Some('{' | '['));
    if !looks_like_json {
        return lines;
    }

    let mut value: serde_json::Value = match serde_json::from_str(body) {
        Ok(parsed) => parsed,
        Err(_) => return lines,
    };

    summarize_json_value(&mut value, summarize, 0);

    // If re-serialization fails, fall back to the joined input text.
    let rendered = match serde_json::to_string_pretty(&value) {
        Ok(pretty) => pretty,
        Err(_) => joined,
    };
    rendered.lines().map(str::to_owned).collect()
}

fn summarize_json_value(value: &mut serde_json::Value, summarize: &SummarizeJsonDef, depth: usize) {
    if depth >= summarize.max_depth {
        return;
    }

    match value {
        serde_json::Value::Object(map) => {
            let keys_to_remove: Vec<String> =
                map.keys()
                    .filter(|k| {
                        let path = if depth == 0 { k.as_str() } else { "" };
                        summarize.exclude.iter().any(|ex| {
                            k.as_str() == ex.as_str() || (depth == 0 && path == ex.as_str())
                        })
                    })
                    .cloned()
                    .collect();
            for k in keys_to_remove {
                map.remove(&k);
            }

            for (k, v) in map.iter_mut() {
                let full_path = if depth == 0 {
                    k.clone()
                } else {
                    format!("{}.{}", depth, k)
                };
                if summarize
                    .always_include
                    .iter()
                    .any(|inc| inc == &full_path || inc == k)
                {
                    continue;
                }
                summarize_json_value(v, summarize, depth + 1);
            }
        }
        serde_json::Value::Array(arr) => {
            if arr.len() > summarize.max_array_items {
                let shown = arr.drain(summarize.max_array_items..).collect::<Vec<_>>();
                let count = shown.len();
                arr.push(serde_json::Value::String(format!(
                    "... {} more item(s) omitted ...",
                    count
                )));
            }
            for item in arr.iter_mut() {
                summarize_json_value(item, summarize, depth + 1);
            }
        }
        _ => {}
    }
}

/// Trims `text` so the kept lines fit within `budget` tokens.
///
/// Text already within budget is returned unchanged. Otherwise lines claim
/// budget in priority order — signal lines (starting with `error`, `warning`,
/// `FAIL`, `panic`, or containing `error[` / `warning[`), then the first
/// quarter of the text, then the last quarter, then the middle. A line that
/// does not fit is skipped so that later, smaller lines can still use the
/// remaining budget.
///
/// The surviving lines are emitted in their original order, followed by a
/// `[... N lines omitted to fit token budget B ...]` marker when anything was
/// dropped. The budget covers the kept lines only; the marker line itself is
/// not counted.
fn apply_token_budget(text: &str, budget: usize) -> String {
    if crate::chunker::count_tokens(text) <= budget {
        return text.to_string();
    }

    let lines: Vec<&str> = text.lines().collect();
    if lines.is_empty() {
        return text.to_string();
    }

    let total = lines.len();
    let head_end = total / 4;
    let tail_start = total * 3 / 4;

    // Lower rank claims budget first: signal > head > tail > middle.
    let rank = |idx: usize| -> u8 {
        let t = lines[idx].trim();
        let is_signal = t.starts_with("error")
            || t.starts_with("warning")
            || t.starts_with("FAIL")
            || t.starts_with("panic")
            || t.contains("error[")
            || t.contains("warning[");
        if is_signal {
            0
        } else if idx < head_end {
            1
        } else if idx >= tail_start {
            2
        } else {
            3
        }
    };

    // Stable sort: within one rank, lines keep their original relative order.
    let mut claim_order: Vec<usize> = (0..total).collect();
    claim_order.sort_by_cached_key(|&idx| rank(idx));

    let mut keep = vec![false; total];
    let mut used = 0usize;
    for idx in claim_order {
        let line_tokens = crate::chunker::count_tokens(lines[idx]);
        if used + line_tokens > budget {
            continue;
        }
        keep[idx] = true;
        used += line_tokens;
    }

    // Emit survivors in source order so the output still reads top to bottom.
    let mut result: Vec<String> = lines
        .iter()
        .zip(&keep)
        .filter(|(_, kept)| **kept)
        .map(|(line, _)| (*line).to_string())
        .collect();

    if result.len() < total {
        result.push(format!(
            "[... {} lines omitted to fit token budget {} ...]",
            total - result.len(),
            budget
        ));
    }

    result.join("\n")
}

/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary, so a multi-byte character is never
/// split (slicing inside one would panic). The result borrows from `s`; nothing
/// is allocated.
fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Walk back from the byte limit to the closest boundary; index 0 always is one.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}

/// Applies exactly one line-count limit, chosen by precedence: `head_lines`
/// (keep the first N), else `tail_lines` (keep the last N), else `max_lines`
/// (keep the first N). Limits of lower precedence are ignored when a higher
/// one is set.
fn apply_sizing(mut lines: Vec<String>, f: &FilterDef) -> Vec<String> {
    match (f.head_lines, f.tail_lines, f.max_lines) {
        (Some(head), _, _) => lines.truncate(head),
        (None, Some(tail), _) => {
            // Remove everything before the last `tail` lines (no-op if short enough).
            let excess = lines.len().saturating_sub(tail);
            lines.drain(..excess);
        }
        (None, None, Some(max)) => lines.truncate(max),
        (None, None, None) => {}
    }
    lines
}

/// Generate the TOML prompt to send to an AI CLI for filter creation.
///
/// `command` is interpolated wherever the prompt names the command being
/// filtered; `sample_output` is embedded verbatim between `---` markers as the
/// example the model should base its filter on.
///
/// Every schema example in the prompt is itself valid TOML 1.0 — in
/// particular, inline tables are written on a single line — because the model
/// is told to return only valid TOML and tends to imitate the examples.
pub fn build_filter_prompt(command: &str, sample_output: &str) -> String {
    format!(
        r#"Generate a tokenix TOML filter for the command `{command}`.

TOML filter schema (all fields optional except match_command):
```
[filters.<slug>]
description = "human-readable purpose"
match_command = "^regex_to_match_full_command_line"
strip_ansi = true          # remove ANSI color codes
strip_lines_matching = ["^pattern1", "^pattern2"]  # drop noisy lines
keep_lines_matching = ["error", "warning"]          # keep only signal lines
match_output = [           # short-circuit: if output matches pattern, return message
  {{ pattern = "already installed", message = "ok (already installed)" }},
  # optional `unless`: skip the short-circuit if output also matches it (avoids masking errors)
  {{ pattern = "Build complete!", message = "ok (build complete)", unless = "warning:|error:" }},
]
max_lines = 50             # truncate to N lines
head_lines = 30            # keep first N lines
tail_lines = 10            # keep last N lines
truncate_lines_at = 120    # truncate individual lines at N bytes (never splits a UTF-8 character)
on_empty = "command: ok"   # message when filter produces empty output

# ADVANCED (extended filtering capabilities):
replace_patterns = [       # regex replacements: [[pattern, replacement], ...]
  ["\\d+\\.\\d+s", "<duration>"],
  ["/home/[^/]+/", "~/"],
]
extract_sections = [       # extract content between markers
  {{ start_pattern = "---- FAILURES ----", end_pattern = "^\\s*$", include_markers = true, max_matches = 3 }},
]
# embedding-based relevance filtering (uses daemon/embed)
semantic_filter = {{ query = "test failure error panic", threshold = 0.3, always_keep = ["^error\\[", "^FAIL"], model = "nomic-v1.5" }}
# structural block deduplication
deduplicate_blocks = {{ min_block_lines = 3, similarity = 0.8, block_delimiter = "^\\s*$" }}
# intelligent JSON summarization
summarize_json = {{ max_array_items = 10, max_depth = 3, always_include = ["packages", "workspace_members"], exclude = ["manifest", "dependencies"] }}
token_budget = 2000        # hard token limit with smart truncation
```

Rules:
- Use strip_lines_matching to drop boilerplate (progress, verbose info)
- Use keep_lines_matching only if output has a clear signal/noise separation
- Use match_output for commands that succeed silently or with a predictable summary line
- Set on_empty when the command normally succeeds silently
- Use replace_patterns to normalize paths, timestamps, IDs, etc.
- Use extract_sections to pull out failure blocks, error sections, etc.
- Use semantic_filter for query-aware relevance (requires embed model)
- Use deduplicate_blocks for repetitive output (test runs, build steps)
- Use summarize_json for large JSON (cargo metadata, API responses)
- Use token_budget as a hard cap with priority-based truncation
- Write every inline table ({{ ... }}) on a single line; TOML does not allow line breaks inside one
- match_command must be a valid Rust regex matching `{command}` or its typical invocations
- Return ONLY valid TOML, no markdown code fences, no explanations

Sample output from `{command} --help` (or similar):
---
{sample_output}
---

TOML filter:"#
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A filter file dropped into `<cwd>/.tokenix/filters` must be picked up
    /// by `load_local_filters` and be retrievable through `find_filter`.
    ///
    /// The fixture lives in the real working directory, so cleanup is
    /// deliberately conservative: only the file written here is deleted, and
    /// the `filters` directory is removed only if that leaves it empty —
    /// pre-existing local filters are never touched. Cleanup also runs before
    /// the assertions so a failing assertion cannot leave the fixture behind.
    #[test]
    fn test_load_local_filters() {
        let filters_dir = std::env::current_dir()
            .unwrap()
            .join(".tokenix")
            .join("filters");
        std::fs::create_dir_all(&filters_dir).unwrap();
        let toml_path = filters_dir.join("test_local_cmd.toml");
        std::fs::write(
            &toml_path,
            r#"
[filters.test_local_cmd]
description = "test local"
match_command = "^test_local_cmd$"
on_empty = "empty filter output"
"#,
        )
        .unwrap();

        let local_filters = load_local_filters();
        let on_empty =
            find_filter("test_local_cmd", &local_filters).map(|filter| filter.on_empty.clone());

        // Clean up. `remove_dir` refuses to delete a non-empty directory, which
        // is exactly the protection wanted for filters that were already there.
        let _ = std::fs::remove_file(&toml_path);
        let _ = std::fs::remove_dir(&filters_dir);

        assert!(!local_filters.is_empty());
        let on_empty = on_empty.expect("filter `test_local_cmd` should be found");
        assert_eq!(on_empty.as_deref(), Some("empty filter output"));
    }

    /// Tokenization splits on whitespace, keeps a double-quoted phrase as one
    /// token (without the quotes), and leaves `KEY=value` words intact.
    #[test]
    fn test_tokenize_command() {
        // (input, expected tokens)
        let cases = [
            ("cargo test", vec!["cargo", "test"]),
            ("echo \"hello world\"", vec!["echo", "hello world"]),
            (
                "env CI=true cargo test",
                vec!["env", "CI=true", "cargo", "test"],
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(tokenize_command(input), expected, "input: {input}");
        }
    }

    /// `bash -c`, `powershell -Command` and `cmd.exe /c` wrappers yield their
    /// inner command; a plain command yields `None`.
    #[test]
    fn test_unwrap_shell_runner() {
        // (input, expected inner command)
        let cases: [(&str, Option<&str>); 4] = [
            ("bash -c 'cargo test'", Some("cargo test")),
            ("powershell -Command \"cargo test\"", Some("cargo test")),
            ("cmd.exe /c \"cargo test\"", Some("cargo test")),
            ("cargo test", None),
        ];

        for (input, expected) in cases {
            assert_eq!(
                unwrap_shell_runner(input),
                expected.map(String::from),
                "input: {input}"
            );
        }
    }

    /// Prefixes, wrappers and global options are peeled off until only the
    /// effective command remains.
    #[test]
    fn test_get_effective_command() {
        // (raw command line, expected effective command)
        let cases = [
            ("cd /app && CI=true cargo test", "cargo test"),
            ("bash -c 'cd /app && CI=true env cargo test'", "cargo test"),
            ("env CI=true cargo test", "cargo test"),
            // timeout wrapper
            ("timeout 180 pnpm run test", "pnpm run test"),
            ("timeout -k 10 180 pnpm run test", "pnpm run test"),
            (
                "timeout --foreground 60s cargo test --quiet",
                "cargo test --quiet",
            ),
            // time wrapper
            ("time pnpm run build", "pnpm run build"),
            // nice wrapper
            ("nice -n 10 make all", "make all"),
            // stacked: timeout + nice
            ("timeout 30 nice -n 5 pnpm run test", "pnpm run test"),
            // pnpm / bun workspaces filters
            ("pnpm --filter @mika/desktop test", "pnpm test"),
            ("bun --cwd /app run build", "bun run build"),
        ];

        for (input, expected) in cases {
            assert_eq!(get_effective_command(input), expected, "input: {input}");
        }
    }

    /// A shell-wrapped compound command yields the full line, the shell body,
    /// and the bare effective command as candidates.
    #[test]
    fn test_derive_command_candidates() {
        let candidates = derive_command_candidates("bash -c 'cd /app && cargo test'");

        for expected in [
            "bash -c 'cd /app && cargo test'",
            "cd /app && cargo test",
            "cargo test",
        ] {
            assert!(
                candidates.iter().any(|c| c == expected),
                "missing {expected:?} in {candidates:?}"
            );
        }
    }

    /// Regression: `timeout 180 pnpm run test` must offer `pnpm run test` as a
    /// candidate, so filters keyed on `pnpm run test` still match.
    #[test]
    fn timeout_wrapper_included_in_candidates() {
        let wanted = "pnpm run test";
        let candidates = derive_command_candidates("timeout 180 pnpm run test");
        let found = candidates.iter().any(|c| c == wanted);
        assert!(found, "candidates: {candidates:?}");
    }

    /// Truncation is byte-based but must back off to a character boundary.
    #[test]
    fn truncate_at_char_boundary_handles_multibyte() {
        // (input, byte limit, expected): an exact ASCII cut, then an input
        // shorter than the limit that comes back unchanged.
        for (input, limit, expected) in [("hello world", 5, "hello"), ("hi", 10, "hi")] {
            assert_eq!(truncate_at_char_boundary(input, limit), expected);
        }

        // 'é' is 2 bytes and occupies bytes 3..5 of the input, so a cut at
        // byte 4 lands mid-character and must back off without panicking.
        let text = "café latte";
        let prefix = truncate_at_char_boundary(text, 4);
        assert_eq!(prefix, "caf");
        assert!(text.starts_with(prefix));
    }

    /// `truncate_lines_at` is a byte limit; with multi-byte characters sitting
    /// on the limit, `apply_filter` must cut on a character boundary instead of
    /// panicking.
    #[test]
    fn apply_filter_truncate_lines_at_no_panic_on_utf8() {
        // Every stage except per-line truncation is disabled.
        let f = FilterDef {
            description: None,
            match_command: ".*".to_string(),
            strip_ansi: false,
            strip_lines_matching: vec![],
            keep_lines_matching: vec![],
            max_lines: None,
            head_lines: None,
            tail_lines: None,
            on_empty: None,
            passthrough_when_emptied: false,
            match_output: vec![],
            truncate_lines_at: Some(4),
            filter_stderr: false,
            replace_patterns: vec![],
            extract_sections: vec![],
            semantic_filter: None,
            deduplicate_blocks: None,
            summarize_json: None,
            token_budget: None,
        };
        // Would panic with naive &l[..4] because 'é'/'ç' straddle the boundary.
        let out = apply_filter("café\nação\n", &f);
        // Each line is cut back to the last full character within 4 bytes.
        assert_eq!(out, "caf\naç");
    }

    /// `semantic_filter_issues` must report one issue for an unknown model and
    /// one for the threshold of 1.5 used here, and none once both are
    /// replaced with valid values.
    #[test]
    fn semantic_filter_issues_flags_unknown_model_and_bad_threshold() {
        // Only the semantic filter is configured, with two deliberate mistakes.
        let mut f = FilterDef {
            description: None,
            match_command: ".*".to_string(),
            strip_ansi: false,
            strip_lines_matching: vec![],
            keep_lines_matching: vec![],
            max_lines: None,
            head_lines: None,
            tail_lines: None,
            on_empty: None,
            passthrough_when_emptied: false,
            match_output: vec![],
            truncate_lines_at: None,
            filter_stderr: false,
            replace_patterns: vec![],
            extract_sections: vec![],
            semantic_filter: Some(SemanticFilterDef {
                query: "errors".to_string(),
                threshold: 1.5,
                always_keep: vec![],
                model: Some("does-not-exist".to_string()),
            }),
            deduplicate_blocks: None,
            summarize_json: None,
            token_budget: None,
        };
        let issues = semantic_filter_issues(&f);
        assert_eq!(
            issues.len(),
            2,
            "expected model + threshold issues: {issues:?}"
        );

        // Repair both settings in place; the same definition must now be clean.
        let sem = f.semantic_filter.as_mut().unwrap();
        sem.model = Some("nomic-v1.5".to_string());
        sem.threshold = 0.3;
        assert!(semantic_filter_issues(&f).is_empty());
    }

    #[test]
    fn split_on_operators_handles_compound_commands() {
        // (command line, expected segments)
        let cases = [
            // `&&`, `;`, `||` and `|` are split points, spaced or unspaced.
            (
                "cd foo && gitleaks detect",
                vec!["cd foo", "gitleaks detect"],
            ),
            ("cd foo;gitleaks", vec!["cd foo", "gitleaks"]),
            ("a || b", vec!["a", "b"]),
            (
                "producer | gitleaks detect",
                vec!["producer", "gitleaks detect"],
            ),
            // An operator inside double quotes is not a split point.
            (r#"echo "a;b" && x"#, vec![r#"echo "a;b""#, "x"]),
        ];
        for (command, expected) in cases {
            assert_eq!(split_on_operators(command), expected);
        }
    }

    #[test]
    fn derive_candidates_segments_compound_commands() {
        // The segment after an unspaced `;` must show up as its own candidate.
        let wanted = "gitleaks detect --source .";
        let candidates = derive_command_candidates("cd foo;gitleaks detect --source .");
        let found = candidates.iter().any(|candidate| candidate == wanted);
        assert!(
            found,
            "expected a gitleaks segment candidate, got: {candidates:?}"
        );
    }

    #[test]
    fn find_filter_matches_command_after_cd_and_pipe() {
        // Filter anchored on `gitleaks` as the base command; every processing
        // stage is switched off because only resolution is under test.
        let gitleaks = FilterDef {
            match_command: "^gitleaks\\b".to_string(),
            description: None,
            strip_ansi: false,
            filter_stderr: false,
            passthrough_when_emptied: false,
            strip_lines_matching: vec![],
            keep_lines_matching: vec![],
            match_output: vec![],
            replace_patterns: vec![],
            extract_sections: vec![],
            max_lines: None,
            head_lines: None,
            tail_lines: None,
            on_empty: None,
            truncate_lines_at: None,
            semantic_filter: None,
            deduplicate_blocks: None,
            summarize_json: None,
            token_budget: None,
        };

        let gitleaks_only = [gitleaks.clone()];
        // An unspaced semicolon after `cd`, an `&&` chain, and a pipe must all
        // resolve to the gitleaks filter.
        for command in [
            "cd repo;gitleaks detect",
            "npm i && gitleaks detect",
            "cat x | gitleaks detect",
        ] {
            assert!(find_filter(command, &gitleaks_only).is_some());
        }
        // `gitleaks` appearing only as an argument must NOT match, because
        // the pattern is anchored on the base command.
        assert!(find_filter("echo gitleaks", &gitleaks_only).is_none());

        // Pipeline prioritization: in "A | B" the filter for B wins over the
        // filter for A, even when A's filter is listed first.
        let cat = FilterDef {
            match_command: "^cat\\b".to_string(),
            ..gitleaks.clone()
        };
        let cat_then_gitleaks = [cat, gitleaks];
        let matched = find_filter("cat x | gitleaks detect", &cat_then_gitleaks).unwrap();
        assert_eq!(matched.match_command, "^gitleaks\\b");
    }

    #[test]
    fn strip_subcommand_global_opts_normalizes_tool_globals() {
        // (raw command, expected effective command)
        let cases = [
            // git global options ahead of the subcommand are removed.
            ("git -C /repo add .", "git add ."),
            ("git -c user.name=x commit -m hi", "git commit -m hi"),
            ("git --git-dir=/r/.git -C /r status", "git status"),
            ("git --no-pager log --oneline", "git log --oneline"),
            // kubectl, docker and cargo get the same treatment.
            ("kubectl -n prod get pods", "kubectl get pods"),
            ("docker -H tcp://h ps -a", "docker ps -a"),
            ("cargo +nightly test", "cargo test"),
            // Commands without global options, or from other tools, pass
            // through unchanged.
            ("git status", "git status"),
            ("ls -la", "ls -la"),
            // A valued option left dangling at the end must not panic.
            ("git -C", "git"),
            ("git --git-dir", "git"),
        ];
        for (raw, expected) in cases {
            assert_eq!(get_effective_command(raw), expected);
        }
    }

    #[test]
    fn strip_package_runner_exposes_inner_tool() {
        // (raw command, expected effective command)
        let cases = [
            // Launchers are peeled away so the wrapped tool becomes the command.
            ("uv run pytest tests/", "pytest tests/"),
            ("python -m ruff check .", "ruff check ."),
            ("python3 -m pytest", "pytest"),
            ("bunx biome check src", "biome check src"),
            ("npx tsc --noEmit", "tsc --noEmit"),
            ("pnpm exec eslint .", "eslint ."),
            ("pnpm dlx prettier -w .", "prettier -w ."),
            // Bare `pnpm build` is a script invocation, not a runner, so it
            // is left untouched.
            ("pnpm build", "pnpm build"),
            // Runner stripping composes with global-option stripping: `npx`
            // is removed first, then kubectl's `-n ns`.
            ("npx kubectl -n ns get pods", "kubectl get pods"),
        ];
        for (raw, expected) in cases {
            assert_eq!(get_effective_command(raw), expected);
        }
    }

    #[test]
    fn find_filter_matches_git_with_global_options() {
        // Filter keyed on `git add`; every processing stage is disabled.
        let git_add = FilterDef {
            match_command: "^git\\s+add\\b".to_string(),
            description: None,
            strip_ansi: false,
            filter_stderr: false,
            passthrough_when_emptied: false,
            strip_lines_matching: vec![],
            keep_lines_matching: vec![],
            match_output: vec![],
            replace_patterns: vec![],
            extract_sections: vec![],
            max_lines: None,
            head_lines: None,
            tail_lines: None,
            on_empty: None,
            truncate_lines_at: None,
            semantic_filter: None,
            deduplicate_blocks: None,
            summarize_json: None,
            token_budget: None,
        };
        let filters = [git_add];
        // Global options between `git` and `add` must not defeat the match,
        // whether the command stands alone or follows a `cd … &&` prefix.
        for command in ["git -C /repo add .", "cd x && git -c k=v add -A"] {
            assert!(find_filter(command, &filters).is_some());
        }
    }

    #[test]
    fn apply_filter_match_output_unless_guards_errors() {
        // Success marker that short-circuits to a sentinel, guarded by an
        // `unless` regex for error-looking output.
        let synced = MatchOutput {
            pattern: "total size is".to_string(),
            message: "ok (synced)".to_string(),
            unless: Some("error|failed".to_string()),
        };
        let filter = FilterDef {
            match_output: vec![synced],
            match_command: ".*".to_string(),
            description: None,
            strip_ansi: false,
            filter_stderr: false,
            passthrough_when_emptied: false,
            strip_lines_matching: vec![],
            keep_lines_matching: vec![],
            replace_patterns: vec![],
            extract_sections: vec![],
            max_lines: None,
            head_lines: None,
            tail_lines: None,
            on_empty: None,
            truncate_lines_at: None,
            semantic_filter: None,
            deduplicate_blocks: None,
            summarize_json: None,
            token_budget: None,
        };

        // Marker present and nothing error-like: collapse to the message.
        let clean = apply_filter("total size is 100\n", &filter);
        assert_eq!(clean, "ok (synced)");

        // Marker present alongside an error: the guard blocks the
        // short-circuit so the error stays visible.
        let out = apply_filter("rsync error\ntotal size is 100\n", &filter);
        assert!(out.contains("error"), "error must not be masked: {out:?}");
    }

    /// Baseline `FilterDef` for scenario tests: matches any command (`.*`) and
    /// leaves every flag off, every list empty and every optional setting
    /// unset, so a test only has to set the single field it exercises.
    fn base_filter() -> FilterDef {
        FilterDef {
            // Identity / matching.
            match_command: ".*".to_string(),
            description: None,
            // Boolean switches.
            strip_ansi: false,
            filter_stderr: false,
            passthrough_when_emptied: false,
            // Rule lists.
            strip_lines_matching: vec![],
            keep_lines_matching: vec![],
            match_output: vec![],
            replace_patterns: vec![],
            extract_sections: vec![],
            // Optional sizing and fallback settings.
            max_lines: None,
            head_lines: None,
            tail_lines: None,
            truncate_lines_at: None,
            on_empty: None,
            token_budget: None,
            // Optional advanced stages.
            semantic_filter: None,
            deduplicate_blocks: None,
            summarize_json: None,
        }
    }

    #[test]
    fn apply_filter_strip_ansi_removes_color_codes() {
        let mut filter = base_filter();
        filter.strip_ansi = true;
        // Red "error" followed by a reset sequence: only the text survives.
        let colored = "\x1b[31merror\x1b[0m here\n";
        let plain = apply_filter(colored, &filter);
        assert_eq!(plain, "error here");
    }

    #[test]
    fn apply_filter_strip_lines_matching_drops_noise() {
        let mut filter = base_filter();
        // Drop DEBUG-prefixed lines and blank / whitespace-only lines.
        filter.strip_lines_matching = ["^DEBUG", "^\\s*$"]
            .iter()
            .map(|pattern| pattern.to_string())
            .collect();
        let noisy = "DEBUG init\nreal line\n\nDEBUG done\nkeep me\n";
        let cleaned = apply_filter(noisy, &filter);
        assert_eq!(cleaned, "real line\nkeep me");
    }

    #[test]
    fn apply_filter_keep_lines_matching_keeps_signal() {
        let mut filter = base_filter();
        // Only lines mentioning a warning or an error are retained.
        filter.keep_lines_matching = vec![String::from("warn|error")];
        let log = "info: starting\nwarn: low disk\nok\nerror: boom\n";
        let kept = apply_filter(log, &filter);
        assert_eq!(kept, "warn: low disk\nerror: boom");
    }

    #[test]
    fn apply_filter_sizing_head_tail_max() {
        let five_lines = "l1\nl2\nl3\nl4\nl5\n";

        // head_lines keeps the first N lines.
        let mut first_two = base_filter();
        first_two.head_lines = Some(2);
        let head_out = apply_filter(five_lines, &first_two);
        assert_eq!(head_out, "l1\nl2");

        // tail_lines keeps the last N lines.
        let mut last_two = base_filter();
        last_two.tail_lines = Some(2);
        let tail_out = apply_filter(five_lines, &last_two);
        assert_eq!(tail_out, "l4\nl5");

        // max_lines caps the output at the first N lines.
        let mut capped = base_filter();
        capped.max_lines = Some(3);
        let max_out = apply_filter(five_lines, &capped);
        assert_eq!(max_out, "l1\nl2\nl3");

        // With head_lines and tail_lines both set, head_lines wins.
        let mut head_and_tail = base_filter();
        head_and_tail.tail_lines = Some(2);
        head_and_tail.head_lines = Some(1);
        let both_out = apply_filter(five_lines, &head_and_tail);
        assert_eq!(both_out, "l1");
    }

    #[test]
    fn apply_filter_replace_patterns_rewrites_lines() {
        // Each rule is a [pattern, replacement] pair.
        let normalize_duration = ["\\d+\\.\\d+s".to_string(), "<dur>".to_string()];
        let shorten_home = ["/home/[^/]+/".to_string(), "~/".to_string()];

        let mut filter = base_filter();
        filter.replace_patterns = vec![normalize_duration, shorten_home];

        let rewritten = apply_filter("built in 1.23s at /home/bob/app\n", &filter);
        assert_eq!(rewritten, "built in <dur> at ~/app");
    }

    #[test]
    fn apply_filter_extract_sections_between_markers() {
        let mut filter = base_filter();
        let between_markers = ExtractSection {
            start_pattern: "^START".to_string(),
            end_pattern: "^END".to_string(),
            include_markers: false,
            max_matches: None,
        };
        filter.extract_sections = vec![between_markers];

        // Markers excluded: only the lines strictly between START and END remain.
        let inner_only = apply_filter("noise\nSTART\na\nb\nEND\ntrailing\n", &filter);
        assert_eq!(inner_only, "a\nb");

        // Flipping include_markers keeps the boundary lines as well.
        filter.extract_sections[0].include_markers = true;
        let with_markers = apply_filter("noise\nSTART\na\nEND\n", &filter);
        assert_eq!(with_markers, "START\na\nEND");
    }

    #[test]
    fn apply_filter_extract_sections_no_match_returns_original() {
        // Markers that never occur in the input.
        let absent = ExtractSection {
            start_pattern: "^NEVER".to_string(),
            end_pattern: "^NOPE".to_string(),
            include_markers: false,
            max_matches: None,
        };
        let mut filter = base_filter();
        filter.extract_sections = vec![absent];

        // With no section found, the content comes back unmodified.
        let out = apply_filter("just\ntwo lines\n", &filter);
        assert_eq!(out, "just\ntwo lines");
    }

    #[test]
    fn apply_filter_deduplicate_blocks_collapses_repeats() {
        let mut filter = base_filter();
        filter.deduplicate_blocks = Some(DeduplicateBlocksDef {
            min_block_lines: 3,
            similarity: 0.8,
            block_delimiter: None,
        });

        // Three identical 3-line blocks, separated by blank lines.
        let block = "x\ny\nz";
        let mut input = [block; 3].join("\n\n");
        input.push('\n');

        let out = apply_filter(&input, &filter);
        // The first occurrence survives verbatim...
        assert!(out.starts_with("x\ny\nz"), "first block kept: {out:?}");
        // ...and the two repeats are replaced by an omission note.
        assert!(
            out.contains("2 similar block(s) omitted"),
            "duplicates collapsed: {out:?}"
        );
    }

    #[test]
    fn apply_filter_token_budget_truncates_with_marker() {
        let mut filter = base_filter();
        filter.token_budget = Some(10);

        // One error line followed by 60 filler lines: far beyond the budget.
        let filler: String = (0..60)
            .map(|i| format!("filler line number {i} with words\n"))
            .collect();
        let input = format!("error: critical failure\n{filler}");

        let out = apply_filter(&input, &filter);
        assert!(
            out.contains("omitted to fit token budget"),
            "expected truncation marker: {out:?}"
        );
        assert!(
            out.len() < input.len(),
            "budgeted output must be smaller than input"
        );
    }

    #[test]
    fn apply_filter_on_empty_for_genuinely_empty_output() {
        let mut filter = base_filter();
        filter.on_empty = Some("cmd: ok".to_string());
        // Both truly empty and whitespace-only output yield the success sentinel.
        for blank in ["", "   \n\t\n"] {
            assert_eq!(apply_filter(blank, &filter), "cmd: ok");
        }
    }

    #[test]
    fn apply_filter_benign_emptied_output_keeps_on_empty() {
        // The keep rule removes every line and the output carries no failure
        // signal, so the `on_empty` success sentinel must still be emitted:
        // the failure guard must not over-trigger on benign output.
        let mut filter = base_filter();
        filter.on_empty = Some("cmd: ok".to_string());
        filter.keep_lines_matching = vec!["^KEEP".to_string()];

        let benign = "benign output line\nnothing notable here\n";
        let out = apply_filter(benign, &filter);
        assert_eq!(out, "cmd: ok");
    }

    #[test]
    fn apply_filter_failure_signal_overrides_on_empty() {
        // Same configuration as the benign case, but the output is a failure
        // in a shape the keep rule does not recognize: the real error must be
        // passed through instead of the "cmd: ok" sentinel.
        let mut filter = base_filter();
        filter.on_empty = Some("cmd: ok".to_string());
        filter.keep_lines_matching = vec!["^KEEP".to_string()];

        let failing = "ERROR: boom\nprocess exited with exit code 1\n";
        let out = apply_filter(failing, &filter);
        assert_ne!(out, "cmd: ok", "failure must not be masked");
        assert!(out.contains("ERROR: boom"), "real error surfaced: {out:?}");
    }

    #[test]
    fn apply_filter_passthrough_when_emptied_opt_in() {
        // With the opt-in flag set, output that the keep rule empties is shown
        // as-is even though it contains no failure signal.
        let mut filter = base_filter();
        filter.passthrough_when_emptied = true;
        filter.on_empty = Some("no changes".to_string());
        filter.keep_lines_matching = vec!["^KEEP".to_string()];

        let out = apply_filter("abc123 some commit subject\n", &filter);
        assert_ne!(out, "no changes");
        assert!(out.contains("abc123"), "real output shown: {out:?}");
    }

    #[test]
    fn apply_filter_match_output_short_circuits_without_unless() {
        // Success marker with no `unless` guard attached.
        let build_ok = MatchOutput {
            pattern: "BUILD SUCCESSFUL".to_string(),
            message: "ok".to_string(),
            unless: None,
        };
        let mut filter = base_filter();
        filter.match_output = vec![build_ok];

        // The whole output collapses to the message, surrounding noise included.
        let out = apply_filter("noise\nBUILD SUCCESSFUL in 2s\nmore noise\n", &filter);
        assert_eq!(out, "ok");
    }

    #[test]
    fn find_filter_picks_longest_matching_pattern() {
        // Two filters that both match `git status -s`: a broad `git` one and
        // a narrower `git status` one, with the broad one listed first.
        let mut any_git = base_filter();
        any_git.match_command = "^git\\b".to_string();
        let mut git_status = base_filter();
        git_status.match_command = "^git\\s+status\\b".to_string();

        let filters = [any_git, git_status];
        let winner = find_filter("git status -s", &filters).unwrap();
        assert_eq!(
            winner.match_command, "^git\\s+status\\b",
            "most specific (longest) pattern wins"
        );
    }

    // --- Golden self-test: run every bundled filter's embedded [[tests.<name>]]
    // cases through the real apply_filter pipeline. Homologation guard so the
    // ~150 declared input→expected pairs can never silently drift.

    /// One embedded golden case from a bundled filter file: raw command output
    /// plus the text the filter is expected to produce from it.
    #[derive(Debug, Deserialize)]
    struct GoldenCase {
        /// Optional label used when reporting a failing case; the golden test
        /// falls back to the case index when it is absent.
        #[serde(default)]
        name: Option<String>,
        /// Optional command line; when present, the golden test also checks
        /// that the filter's `match_command` regex matches it.
        #[serde(default)]
        command: Option<String>,
        /// Raw output fed to `apply_filter`.
        input: String,
        /// Expected filtered output (compared after trimming trailing whitespace).
        expected: String,
    }

    /// Test-side view of a bundled filter TOML file: the filter definitions
    /// and the golden cases declared next to them. Both tables default to
    /// empty when the file omits them.
    #[derive(Debug, Deserialize)]
    struct FilterTestFile {
        /// `[filters.<name>]` tables, keyed by filter name.
        #[serde(default)]
        filters: HashMap<String, FilterDef>,
        /// `[[tests.<name>]]` arrays, keyed by the name of the filter they exercise.
        #[serde(default)]
        tests: HashMap<String, Vec<GoldenCase>>,
    }

    #[test]
    fn verbose_real_output_compresses_at_least_70pct() {
        // The corpus golden inputs are tiny fixtures, which dilutes the
        // headline 55% figure. On the real use case (verbose, noisy command
        // output) the bundled filters must compress heavily. Every pair goes
        // through the real `find_filter` + `apply_filter` path, exactly like
        // the hook, and the commands are deliberately written as VARIANTS so
        // filter resolution is exercised too.
        use crate::chunker::count_tokens;
        let bundled = load_bundled_filters();

        // Percentage of `before` tokens that were removed to reach `after`.
        let saved_pct = |before: usize, after: usize| {
            (before.saturating_sub(after) as f64 / before as f64) * 100.0
        };

        // Realistic, verbose success output for common noisy commands. A clean
        // run is mostly progress/compile noise that collapses to a sentinel.
        let compiling = (0..30)
            .map(|i| format!("   Compiling crate_{i} v0.{i}.0"))
            .collect::<Vec<_>>()
            .join("\n");
        let cargo_build_out = format!(
            "    Updating crates.io index\n{compiling}\n    Finished dev [unoptimized + debuginfo] target(s) in 18.4s\n"
        );

        let test_lines = (0..42)
            .map(|i| format!("test module::case_{i} ... ok"))
            .collect::<Vec<_>>()
            .join("\n");
        let cargo_test_out = format!(
            "{compiling}\n     Running unittests src/lib.rs\nrunning 42 tests\n{test_lines}\ntest result: ok. 42 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.31s\n"
        );

        let mut npm_out: String = (0..25)
            .map(|i| format!("npm WARN deprecated pkg_{i}@1.0.0: use pkg_{i}@2\n"))
            .collect();
        npm_out.push_str("added 642 packages, and audited 643 packages in 12s\n");
        npm_out.push_str("found 0 vulnerabilities\n");

        let mut pip_out: String = (0..20)
            .map(|i| format!("Requirement already satisfied: dep_{i} in ./venv\n"))
            .collect();
        pip_out.push_str("Successfully installed app-1.0.0\n");

        let progress_rows = ["tests/test_x.py ................................"; 6].join("\n");
        let pytest_out = format!(
            "============================= test session starts ==============================\nplatform linux -- Python 3.12.0, pytest-8.0.0\ncollected 88 items\n\n{}\n\n============================== 88 passed in 4.21s ==============================\n",
            progress_rows
        );

        let mut docker_out: String = (1..25)
            .map(|i| format!("#{i} [stage {i}/24] RUN build step {i}\n#{i} DONE 0.{i}s\n"))
            .collect();
        docker_out
            .push_str("#25 exporting to image\n#25 naming to docker.io/library/app:latest DONE\n");

        let eslint_out = "No problems found in 240 files.\n";
        let git_pull_out = concat!(
            "remote: Enumerating objects: 1200, done.\n",
            "remote: Counting objects: 100% (1200/1200), done.\n",
            "remote: Compressing objects: 100% (600/600), done.\n",
            "remote: Total 1200 (delta 800), reused 1100 (delta 700)\n",
            "Unpacking objects: 100% (1200/1200), 2.40 MiB | 4.80 MiB/s, done.\n",
            "From github.com:org/repo\n",
            "   abc1234..def5678  main       -> origin/main\n",
            "Updating abc1234..def5678\n",
            "Fast-forward\n",
        );

        // (command variant, sample output)
        let cases: [(&str, &str); 8] = [
            ("cargo build --release", cargo_build_out.as_str()),
            ("timeout 600 cargo test --all", cargo_test_out.as_str()),
            ("env CI=1 npm install --no-fund", npm_out.as_str()),
            ("pip install -r requirements.txt", pip_out.as_str()),
            ("python -m pytest -q", pytest_out.as_str()),
            ("docker build -t app:latest .", docker_out.as_str()),
            ("npx eslint src/", eslint_out),
            ("git pull --rebase origin main", git_pull_out),
        ];

        let mut in_total = 0usize;
        let mut out_total = 0usize;
        let mut report: Vec<String> = Vec::with_capacity(cases.len());
        for (cmd, sample) in cases {
            let Some(filter) = find_filter(cmd, &bundled) else {
                panic!("no bundled filter resolved for {cmd:?}");
            };
            let filtered = apply_filter(sample, filter);
            let it = count_tokens(sample);
            let ot = count_tokens(&filtered);
            let pct = saved_pct(it, ot);
            report.push(format!("{cmd}: {it}->{ot} ({pct:.0}%)"));
            in_total += it;
            out_total += ot;
        }

        // Aggregate saving across all samples, weighted by token count.
        let agg = saved_pct(in_total, out_total);
        eprintln!(
            "verbose-economy: {in_total}->{out_total} tokens, {agg:.1}% saved\n  {}",
            report.join("\n  ")
        );
        assert!(
            agg >= 70.0,
            "verbose real output must compress >=70%, got {agg:.1}%\n{}",
            report.join("\n")
        );
    }

    #[test]
    fn match_command_resolves_many_invocation_variants() {
        // Homologation: a filter has to survive the many shapes in which a
        // command arrives: wrappers (timeout/env/nice), launchers
        // (npx/uv run/python -m), shell `-c`, tool global options,
        // cd-prefixes, pipes and `&&` chains.
        let bundled = load_bundled_filters();

        // (label used in the failure report, command variants to resolve)
        let variants: &[(&str, &[&str])] = &[
            (
                "cargo build",
                &[
                    "cargo build",
                    "cargo build --release",
                    "timeout 300 cargo build",
                    "env RUSTFLAGS=-W cargo build -j 8",
                    "cargo +nightly build",
                    "cd crates/app && cargo build",
                    "bash -c 'cargo build --workspace'",
                    "nice -n 10 cargo build",
                ],
            ),
            (
                "pytest",
                &[
                    "pytest",
                    "pytest -q tests/",
                    "python -m pytest",
                    "python3 -m pytest tests/unit",
                    "uv run pytest -x",
                    "cd backend && pytest",
                ],
            ),
            (
                "eslint",
                &[
                    "eslint .",
                    "npx eslint src/ --fix",
                    "cd web && npx eslint .",
                ],
            ),
            (
                "kubectl get",
                &[
                    "kubectl get pods",
                    "kubectl -n prod get pods -o wide",
                    "kubectl --context staging get deploy",
                ],
            ),
            (
                "git status",
                &[
                    "git status",
                    "git status -s",
                    "git -C /repo status",
                    "git -c color.ui=always status",
                    "cd repo && git status",
                ],
            ),
            (
                "docker build",
                &[
                    "docker build -t x .",
                    "docker buildx build --platform linux/amd64 .",
                    "timeout 900 docker build .",
                ],
            ),
            (
                "npm install",
                &[
                    "npm install",
                    "npm i",
                    "npm install --save-dev typescript",
                    "cat .npmrc && npm install",
                ],
            ),
            // Newly added filters: their variants must resolve as well.
            (
                "npm ci",
                &["npm ci", "timeout 300 npm ci", "cd web && npm ci"],
            ),
            (
                "cargo bench",
                &[
                    "cargo bench",
                    "cargo bench --bench parse",
                    "cargo +nightly bench",
                ],
            ),
            ("cargo update", &["cargo update", "cargo update -p serde"]),
            (
                "pip list",
                &[
                    "pip list",
                    "pip freeze",
                    "python -m pip list",
                    "pip3 freeze",
                ],
            ),
            (
                "git cherry-pick",
                &[
                    "git cherry-pick abc123",
                    "git -C /repo cherry-pick --continue",
                ],
            ),
            (
                "dotnet run",
                &[
                    "dotnet run",
                    "dotnet run --project app",
                    "cd svc && dotnet run",
                ],
            ),
            (
                "prisma",
                &[
                    "prisma generate",
                    "npx prisma migrate dev",
                    "pnpm prisma db push",
                    "yarn prisma studio",
                    "bunx prisma generate",
                ],
            ),
            (
                "wrangler",
                &[
                    "wrangler deploy",
                    "npx wrangler deploy",
                    "pnpm wrangler pages deploy dist",
                    "yarn wrangler tail",
                ],
            ),
        ];

        // Collect every unresolved variant so one run reports all gaps.
        let misses: Vec<String> = variants
            .iter()
            .flat_map(|&(label, cmds)| cmds.iter().map(move |&cmd| (label, cmd)))
            .filter(|&(_, cmd)| find_filter(cmd, &bundled).is_none())
            .map(|(label, cmd)| format!("[{label}] no filter for variant: {cmd:?}"))
            .collect();

        assert!(
            misses.is_empty(),
            "{} command variant(s) failed to resolve a filter:\n{}",
            misses.len(),
            misses.join("\n")
        );
    }

    /// Runs every golden case embedded in the bundled filter files through
    /// `apply_filter` and fails if any produces unexpected output.
    ///
    /// Every problem is collected into `failures` and reported together:
    /// TOML parse errors, test tables that reference an undefined filter,
    /// invalid `match_command` regexes, `command` values that the regex does
    /// not match, and output mismatches. A single broken filter therefore
    /// never hides problems in the others.
    #[test]
    fn bundled_filters_pass_embedded_golden_tests() {
        let mut total = 0usize;
        let mut files_with_tests = 0usize;
        let mut failures: Vec<String> = Vec::new();

        for asset in BundledFilters::iter() {
            let file = BundledFilters::get(&asset).expect("bundled asset readable");
            let content = std::str::from_utf8(file.data.as_ref()).expect("filter is utf8");
            let parsed: FilterTestFile = match toml::from_str(content) {
                Ok(p) => p,
                Err(e) => {
                    failures.push(format!("{asset}: TOML parse error: {e}"));
                    continue;
                }
            };
            if !parsed.tests.is_empty() {
                files_with_tests += 1;
            }
            for (fname, cases) in &parsed.tests {
                let Some(fdef) = parsed.filters.get(fname) else {
                    failures.push(format!(
                        "{asset}: [[tests.{fname}]] references undefined [filters.{fname}]"
                    ));
                    continue;
                };

                // Compile `match_command` once per filter, and only when some
                // case declares a `command` to check against it. An invalid
                // pattern is recorded as a failure rather than aborting the run.
                let command_re = if cases.iter().any(|case| case.command.is_some()) {
                    match Regex::new(&fdef.match_command) {
                        Ok(re) => Some(re),
                        Err(e) => {
                            failures.push(format!("{asset} [{fname}] invalid match_command: {e}"));
                            None
                        }
                    }
                } else {
                    None
                };

                for (i, case) in cases.iter().enumerate() {
                    total += 1;

                    // Optional resolution check: the declared command must be
                    // matched by the filter's own `match_command`.
                    if let (Some(command), Some(re)) =
                        (case.command.as_deref(), command_re.as_ref())
                    {
                        if !re.is_match(command) {
                            failures.push(format!(
                                "{asset} [{fname}] expected match_command to match {command:?}"
                            ));
                        }
                    }

                    // Output check; trailing whitespace is not significant.
                    let got = apply_filter(&case.input, fdef);
                    if got.trim_end() != case.expected.trim_end() {
                        let label = case.name.clone().unwrap_or_else(|| format!("#{i}"));
                        failures.push(format!(
                            "{asset} [{fname} / {label}]\n  expected: {:?}\n  got:      {:?}",
                            case.expected, got
                        ));
                    }
                }
            }
        }

        eprintln!(
            "golden: ran {total} embedded cases across {files_with_tests} bundled filter files"
        );
        assert!(
            failures.is_empty(),
            "{} bundled golden filter case(s) failed:\n\n{}",
            failures.len(),
            failures.join("\n\n")
        );
    }

    #[test]
    fn bundled_filters_require_minimum_tests() {
        // Every bundled filter must declare at least two golden cases; files
        // that fail to parse are reported as failures too.
        let mut failures = Vec::new();
        for asset in BundledFilters::iter() {
            let file = BundledFilters::get(&asset).expect("bundled asset readable");
            let content = std::str::from_utf8(file.data.as_ref()).expect("filter is utf8");
            let parsed = match toml::from_str::<FilterTestFile>(content) {
                Ok(parsed) => parsed,
                Err(e) => {
                    failures.push(format!("{asset}: TOML parse error: {e}"));
                    continue;
                }
            };
            for fname in parsed.filters.keys() {
                let declared = parsed.tests.get(fname).map(|cases| cases.len());
                match declared {
                    None => failures.push(format!(
                        "{asset}: [filters.{fname}] has NO test cases defined"
                    )),
                    Some(count) if count < 2 => failures.push(format!(
                        "{asset}: [filters.{fname}] has only {count} test case(s), expected at least 2"
                    )),
                    Some(_) => {}
                }
            }
        }
        assert!(
            failures.is_empty(),
            "The following bundled filter(s) do not meet the minimum test requirement (>=2 golden cases):\n\n{}",
            failures.join("\n")
        );
    }

    /// Runs the real filter pipeline over every golden case embedded in the
    /// bundled filter files and hands each result to `visit` as
    /// `(asset, filter_name, input, filtered_output)`.
    ///
    /// Assets that do not parse, and test tables that name a filter the same
    /// file does not define, are skipped without reporting anything.
    fn for_each_golden_case<F: FnMut(&str, &str, &str, &str)>(mut visit: F) {
        for asset in BundledFilters::iter() {
            let embedded = BundledFilters::get(&asset).expect("bundled asset readable");
            let text = std::str::from_utf8(embedded.data.as_ref()).expect("utf8");

            let parsed: Result<FilterTestFile, _> = toml::from_str(text);
            let Ok(parsed) = parsed else {
                continue;
            };

            for (fname, cases) in &parsed.tests {
                if let Some(fdef) = parsed.filters.get(fname) {
                    for case in cases {
                        let filtered = apply_filter(&case.input, fdef);
                        visit(&asset, fname, &case.input, &filtered);
                    }
                }
            }
        }
    }

    /// Diagnostic report, ignored by default (run it explicitly with `--ignored`).
    ///
    /// Aggregates input/output token counts per filter via
    /// `for_each_golden_case_grouped`, then writes a CSV named
    /// `tokenix_diag_compression.csv` into the system temp directory with the
    /// columns `FILTER,IN_TOK,OUT_TOK,PCT_SAVED`. The path is echoed to stderr.
    ///
    /// Rows are ordered worst-compressing first so the filters that most need
    /// attention sit at the top of the file.
    #[test]
    #[ignore]
    fn diag_per_filter_compression() {
        let mut rows: Vec<(String, usize, usize, f64)> = Vec::new();
        for_each_golden_case_grouped(|fname, it, ot| {
            // A filter with no input tokens is reported as 0% saved instead of
            // dividing by zero.
            let pct = if it > 0 {
                (it.saturating_sub(ot) as f64 / it as f64) * 100.0
            } else {
                0.0
            };
            rows.push((fname.to_string(), it, ot, pct));
        });
        // Lowest %-saved first; ties broken by biggest input (most waste left).
        // `total_cmp` is a total order on f64, so the comparator has no panic
        // path (unlike `partial_cmp(..).unwrap()`).
        rows.sort_by(|a, b| a.3.total_cmp(&b.3).then_with(|| b.1.cmp(&a.1)));
        let mut out = String::from("FILTER,IN_TOK,OUT_TOK,PCT_SAVED\n");
        for (n, it, ot, pct) in &rows {
            out.push_str(&format!("{n},{it},{ot},{pct:.0}\n"));
        }
        let path = std::env::temp_dir().join("tokenix_diag_compression.csv");
        std::fs::write(&path, out).unwrap();
        eprintln!("wrote {}", path.display());
    }

    /// Per-filter variant of `for_each_golden_case`: sums the token counts of
    /// all of a filter's golden inputs and of their filtered outputs, then
    /// calls `visit(name, in_tok, out_tok)` once per test table.
    ///
    /// Assets that do not parse and test tables without a matching filter
    /// definition are skipped.
    fn for_each_golden_case_grouped<F: FnMut(&str, usize, usize)>(mut visit: F) {
        use crate::chunker::count_tokens;

        for asset in BundledFilters::iter() {
            let embedded = BundledFilters::get(&asset).unwrap();
            let text = std::str::from_utf8(embedded.data.as_ref()).unwrap();

            let parsed: Result<FilterTestFile, _> = toml::from_str(text);
            let Ok(parsed) = parsed else {
                continue;
            };

            for (fname, cases) in &parsed.tests {
                if let Some(fdef) = parsed.filters.get(fname) {
                    let (in_tok, out_tok) =
                        cases.iter().fold((0usize, 0usize), |(acc_in, acc_out), case| {
                            let case_in = count_tokens(&case.input);
                            let case_out = count_tokens(&apply_filter(&case.input, fdef));
                            (acc_in + case_in, acc_out + case_out)
                        });
                    visit(fname, in_tok, out_tok);
                }
            }
        }
    }

    /// Economy invariant: no golden case may come out of its filter with more
    /// tokens than went in, apart from a small fixed allowance.
    #[test]
    fn filters_never_inflate_output_tokens() {
        use crate::chunker::count_tokens;

        // The allowance covers very short inputs that get replaced by a
        // sentinel message such as "cmd: ok"; anything beyond it means the
        // filter is a net token loss.
        const SLACK_TOKENS: usize = 8;

        let mut inflated: Vec<String> = Vec::new();
        for_each_golden_case(|asset, fname, input, got| {
            let in_tok = count_tokens(input);
            let out_tok = count_tokens(got);
            if out_tok <= in_tok + SLACK_TOKENS {
                return;
            }
            inflated.push(format!(
                "{asset} [{fname}]: {in_tok} -> {out_tok} tokens (inflated)"
            ));
        });

        assert!(
            inflated.is_empty(),
            "{} filter case(s) inflated output beyond slack:\n{}",
            inflated.len(),
            inflated.join("\n")
        );
    }

    /// Aggregate economy check over the whole bundled corpus.
    ///
    /// Individual golden-equality tests can keep passing while a broken
    /// strip/keep stage quietly stops compressing anything; this test guards
    /// against that by requiring a large overall token reduction across all
    /// sample inputs.
    #[test]
    fn filters_deliver_aggregate_token_savings() {
        use crate::chunker::count_tokens;

        let mut tokens_in = 0usize;
        let mut tokens_out = 0usize;
        let mut seen = 0usize;
        for_each_golden_case(|_, _, input, got| {
            seen += 1;
            tokens_in += count_tokens(input);
            tokens_out += count_tokens(got);
        });

        // Sanity check that the embedded corpus was actually walked.
        assert!(seen > 100, "expected the full corpus, saw {seen} cases");

        let saved = tokens_in.saturating_sub(tokens_out);
        let ratio = saved as f64 / tokens_in as f64;
        let pct = ratio * 100.0;
        eprintln!("economy: {seen} cases, {tokens_in} -> {tokens_out} tokens, {pct:.1}% saved");

        assert!(tokens_out < tokens_in, "corpus must shrink, not grow");
        assert!(
            pct >= 40.0,
            "expected >=40% aggregate token savings across the corpus, got {pct:.1}%"
        );
    }

    /// Table-driven check of `output_has_failure_signal`: every sample in
    /// `positives` must be flagged as a failure, and every sample in
    /// `negatives` (benign text that merely mentions errors or failures) must
    /// not be.
    #[test]
    fn output_has_failure_signal_strict() {
        let positives: &[&str] = &[
            // Real failure output.
            "fatal: something went wrong\nERROR: build failed with exit code 1",
            "panic: runtime error: index out of range",
            "FAILED tests/test_foo.py::test_bar - AssertionError",
            "Traceback (most recent call last):\n  File \"x.py\"",
            "error: aborting due to 1 error",
            // Additional positives: log levels, package managers, Go tests,
            // signals, exceptions and non-zero exit codes.
            "[ERROR] database connection failed",
            "2026-06-22T12:00:00Z [error] database down",
            "npm ERR! code ELIFECYCLE",
            "yarn ERR: error Command failed",
            "--- FAIL: TestExploreCodebase (0.05s)",
            "Segmentation fault (core dumped)",
            "Process received signal SIGSEGV",
            "java.lang.NullPointerException: object is null",
            "exited with status: 1",
            "exit status 127",
            "Command failed with exit code 2",
            "time=\"xxx\" level=error msg=\"db lost\"",
        ];

        // Benign success summaries that merely mention error/fail.
        let negatives: &[&str] = &[
            "test result: ok. 0 failed",
            "0 errors, 0 warnings",
            "no errors found",
            "Compiling: 0 failures",
            "exit status 0",
            "exited with status: 0",
            "warnings: 12",
        ];

        for &sample in positives {
            assert!(
                output_has_failure_signal(sample),
                "expected a failure signal in {sample:?}"
            );
        }
        for &sample in negatives {
            assert!(
                !output_has_failure_signal(sample),
                "unexpected failure signal in {sample:?}"
            );
        }
    }

    /// Homologation guard: feeds every bundled filter an unambiguous failure
    /// payload and requires that some failure marker (`error`, `fail` or
    /// `fatal`, case-insensitive) is still present in the filtered output, so a
    /// generic command failure is never reduced to a success `on_empty`
    /// message.
    ///
    /// Assets that fail to parse are skipped here.
    #[test]
    fn bundled_filters_never_mask_generic_failure() {
        let payload = "fatal: the operation failed\nERROR: process exited with exit code 1\nFAILED";
        let survives = Regex::new(r"(?i)error|fail|fatal").unwrap();

        let mut masked: Vec<String> = Vec::new();
        for asset in BundledFilters::iter() {
            let embedded = BundledFilters::get(&asset).unwrap();
            let text = std::str::from_utf8(embedded.data.as_ref()).unwrap();

            let parsed: Result<FilterFile, _> = toml::from_str(text);
            let Ok(parsed) = parsed else {
                continue;
            };

            masked.extend(parsed.filters.iter().filter_map(|(name, fdef)| {
                let got = apply_filter(payload, fdef);
                if survives.is_match(&got) {
                    None
                } else {
                    Some(format!("{asset} [{name}] -> {:?}", got))
                }
            }));
        }

        assert!(
            masked.is_empty(),
            "{} bundled filter(s) masked a generic failure as success:\n{}",
            masked.len(),
            masked.join("\n")
        );
    }

    /// The bundled filter set must resolve the relative invocation `./gradlew`
    /// to the gradlew filter, identified here by its `on_empty` message.
    #[test]
    fn test_gradlew_match() {
        let bundled = load_bundled_filters();
        let Some(matched) = find_filter("./gradlew", &bundled) else {
            panic!("gradlew filter must be found for './gradlew'");
        };
        assert_eq!(matched.on_empty.as_deref(), Some("gradlew: success"));
    }

    /// Data-driven check of `find_filter` command resolution.
    ///
    /// A small synthetic filter set is built where each filter's `on_empty`
    /// doubles as a label. Every case is a `(command, expected_label)` pair:
    /// `Some(label)` means that filter must be selected, `None` means no filter
    /// may be selected (help/version/verbose/format flags are expected to
    /// bypass filtering). A hand-written table is followed by a generated
    /// matrix of prefix x runner x tool x argument x shell-wrapper x flag
    /// combinations.
    #[test]
    fn find_filter_parameter_scenarios() {
        /// Builds a filter that only carries a command pattern and a label
        /// (stored in `on_empty`); every other knob is left empty/off.
        fn labeled_filter(match_command: &str, label: &str) -> FilterDef {
            FilterDef {
                description: None,
                match_command: match_command.to_string(),
                strip_ansi: false,
                strip_lines_matching: vec![],
                keep_lines_matching: vec![],
                max_lines: None,
                head_lines: None,
                tail_lines: None,
                on_empty: Some(label.to_string()),
                passthrough_when_emptied: false,
                match_output: vec![],
                truncate_lines_at: None,
                filter_stderr: false,
                replace_patterns: vec![],
                extract_sections: vec![],
                semantic_filter: None,
                deduplicate_blocks: None,
                summarize_json: None,
                token_budget: None,
            }
        }

        // Order is part of the fixture: the broad `^git\b` entry sits before
        // `git branch` and after `git diff` / `git log`, which the
        // collision/specificity cases below rely on.
        let filters = [
            labeled_filter("^cargo\\s+test\\b", "cargo-test"),
            labeled_filter("^docker\\s+build\\b", "docker-build"),
            labeled_filter("^git\\s+diff\\b", "git-diff"),
            labeled_filter("^git\\s+log\\b", "git-log"),
            // The only entry with `summarize_json` set; the JSON-format cases
            // below expect it to keep matching where the others are bypassed.
            FilterDef {
                summarize_json: Some(SummarizeJsonDef {
                    max_array_items: 10,
                    max_depth: 3,
                    always_include: vec![],
                    exclude: vec![],
                }),
                ..labeled_filter("^kubectl\\s+get\\b", "kubectl-get")
            },
            labeled_filter("^pytest\\b", "pytest"),
            labeled_filter("^git\\b", "git-broad"),
            labeled_filter("^git\\s+branch\\b", "git-branch"),
            labeled_filter("^eslint\\b", "eslint"),
        ];

        // Resolves a command to the label of the selected filter, if any.
        let run =
            |cmd: &str| find_filter(cmd, &filters).map(|f| f.on_empty.as_ref().unwrap().as_str());

        // 120+ data-driven test cases validating parameters, wrappers, shell features, and bypass flags
        let static_cases = vec![
            // 1. Tool-specific basic matches
            ("cargo test", Some("cargo-test")),
            ("docker build", Some("docker-build")),
            ("git diff", Some("git-diff")),
            ("git log", Some("git-log")),
            ("kubectl get", Some("kubectl-get")),
            ("pytest", Some("pytest")),
            ("eslint", Some("eslint")),
            ("git branch", Some("git-branch")),
            // 2. Standard parameters & flags
            (
                "cargo test --workspace --all-features --jobs 4",
                Some("cargo-test"),
            ),
            ("cargo test -p my-package --lib", Some("cargo-test")),
            (
                "docker build -t image:latest -f Dockerfile .",
                Some("docker-build"),
            ),
            (
                "docker build --build-arg KEY=VAL --no-cache .",
                Some("docker-build"),
            ),
            (
                "git diff HEAD~1 HEAD --stat --compact-summary",
                Some("git-diff"),
            ),
            ("git diff main..feature --name-status", Some("git-diff")),
            (
                "git log -n 50 --oneline --graph --decorate",
                Some("git-log"),
            ),
            (
                "git log --author=\"Bob\" --since=\"1 week ago\"",
                Some("git-log"),
            ),
            (
                "kubectl get pods -n kube-system -o wide",
                Some("kubectl-get"),
            ),
            (
                "kubectl get services,deployments -l app=nginx",
                Some("kubectl-get"),
            ),
            (
                "pytest tests/test_auth.py -k \"login_successful\"",
                Some("pytest"),
            ),
            (
                "pytest -v --tb=short --cov=src --cov-report=html",
                Some("pytest"),
            ),
            ("eslint src/ --ext .ts,.tsx --fix", Some("eslint")),
            (
                "eslint --cache --resolve-plugins-relative-to .",
                Some("eslint"),
            ),
            // 3. Env vars & wrappers
            ("CI=true cargo test", Some("cargo-test")),
            ("NODE_ENV=test PORT=3000 pytest", Some("pytest")),
            ("cross-env CI=true pnpm exec eslint", Some("eslint")),
            ("time cargo test", Some("cargo-test")),
            ("nice cargo test", Some("cargo-test")),
            ("nice -n 10 cargo test", Some("cargo-test")),
            ("timeout 30s pytest", Some("pytest")),
            ("timeout --foreground 60s pytest", Some("pytest")),
            ("timeout -k 5 10 nice -n 19 pytest", Some("pytest")),
            (
                "CI=true timeout 30 nice -n 5 cargo test --quiet",
                Some("cargo-test"),
            ),
            // 4. Directory prefixes (cd / pushd)
            ("cd app && cargo test", Some("cargo-test")),
            ("cd app; cargo test", Some("cargo-test")),
            ("cd app || exit 1; cargo test", Some("cargo-test")),
            ("pushd app && pytest", Some("pytest")),
            ("cd /d C:\\Project && docker build .", Some("docker-build")),
            ("cd src && ENV=1 timeout 10 pytest -v", Some("pytest")),
            // 5. Global tool options with subcommand parameters
            ("git -C /src diff", Some("git-diff")),
            ("git --git-dir=/src/.git diff", Some("git-diff")),
            ("git -c core.autocrlf=input diff", Some("git-diff")),
            ("git --no-pager diff", Some("git-diff")),
            (
                "git -C /src -c k=v --no-pager diff --stat",
                Some("git-diff"),
            ),
            ("docker -H tcp://1.2.3.4:2376 build .", Some("docker-build")),
            ("docker --context default build .", Some("docker-build")),
            (
                "kubectl --kubeconfig=~/.kube/config get pods",
                Some("kubectl-get"),
            ),
            (
                "kubectl -n default --context=dev get pods",
                Some("kubectl-get"),
            ),
            // 6. Package runners
            ("npx eslint", Some("eslint")),
            ("npx --no-install eslint .", Some("eslint")),
            ("pnpm exec eslint", Some("eslint")),
            ("pnpm dlx eslint", Some("eslint")),
            ("bunx eslint", Some("eslint")),
            ("bun x eslint", Some("eslint")),
            ("yarn dlx eslint", Some("eslint")),
            ("uv run pytest", Some("pytest")),
            ("uvx pytest", Some("pytest")),
            ("python -m pytest", Some("pytest")),
            ("python3 -m pytest", Some("pytest")),
            ("python -m ruff check", None), // no ruff filter registered
            // 7. Shell runners
            ("bash -c \"cargo test\"", Some("cargo-test")),
            ("sh -c \"pytest\"", Some("pytest")),
            ("cmd.exe /c \"cargo test\"", Some("cargo-test")),
            ("powershell -Command \"cargo test\"", Some("cargo-test")),
            ("pwsh -Command \"pytest\"", Some("pytest")),
            ("& 'cargo test'", Some("cargo-test")),
            // 8. Help flag bypass variants
            ("cargo test --help", None),
            ("cargo test -h", None),
            ("cargo test help", None),
            ("cargo test /h", None),
            ("cargo test /?", None),
            ("cargo test --help-all", None),
            ("cargo test -help", None),
            ("git diff --help", None),
            ("git diff -h", None),
            ("git help diff", None),
            ("kubectl get --help", None),
            ("pytest -h", None),
            ("eslint --help", None),
            // 9. Version flag bypass variants
            ("node --version", None),
            ("node -v", None),
            ("python -V", None),
            ("python3 --version", None),
            ("git --version", None),
            ("git -v", None),
            ("docker --version", None),
            ("docker -v", None),
            ("kubectl version", None),
            // 10. Debug / Verbose bypass variants
            ("cargo test --debug", None),
            ("cargo test --verbose", None),
            ("cargo test --trace", None),
            ("cargo test -vv", None),
            ("cargo test -vvv", None),
            ("cargo test --log-level=debug", None),
            ("cargo test --log-level=trace", None),
            ("pytest --verbose", None),
            ("pytest -vv", None),
            ("git diff --verbose", None),
            ("docker build --debug", None),
            ("kubectl get pods --log-level=debug", None),
            // 11. YAML format bypass variants
            ("kubectl get pods --yaml", None),
            ("kubectl get pods -o yaml", None),
            ("kubectl get pods -o=yaml", None),
            ("kubectl get pods --format yaml", None),
            ("kubectl get pods --format=yaml", None),
            ("docker inspect --format yaml", None),
            // 12. JSON format bypass vs match
            ("cargo test --json", None),                // no JSON support
            ("cargo test -o json", None),               // no JSON support
            ("cargo test -o=json", None),               // no JSON support
            ("cargo test --message-format=json", None), // no JSON support
            ("pytest --json", None),                    // no JSON support
            ("kubectl get pods -o json", Some("kubectl-get")), // supported!
            ("kubectl get pods -o=json", Some("kubectl-get")), // supported!
            ("kubectl get pods --format=json", Some("kubectl-get")), // supported!
            // 13. Collision & specificity
            ("git branch", Some("git-branch")),
            ("git branch -a", Some("git-branch")),
            ("git checkout -b branch", Some("git-broad")), // git-checkout falls back to git-broad
            ("git status", Some("git-broad")),             // git-status falls back to git-broad
            ("git status -s", Some("git-broad")),
            // 14. Complex combined expressions
            (
                "NODE_ENV=production PORT=8080 timeout 30s npx eslint --fix --ext .ts .",
                Some("eslint"),
            ),
            (
                "cd /app && time nice -n 5 pnpm exec eslint --cache",
                Some("eslint"),
            ),
            ("git -C /repo -c k=v diff --stat --verbose", None), // verbose bypasses
            ("kubectl --kubeconfig=config get pods -o yaml", None), // yaml bypasses
            (
                "cd /app && npm install --no-audit | git log --oneline --help",
                None,
            ), // help bypasses
            // 15. More combined and edge cases
            ("git", Some("git-broad")),
            ("git -v diff", None), // version flag bypasses
            ("nice nice nice cargo test", Some("cargo-test")), // nested wrappers
            ("cd app && pushd src && time cargo test", Some("cargo-test")),
            ("npx npx eslint", Some("eslint")),
            ("npx pnpm exec eslint", Some("eslint")),
            ("bunx bunx eslint", Some("eslint")),
            ("timeout 10 timeout 10 pytest", Some("pytest")),
            ("pytest -v --tb=short --json", None),
            ("kubectl get pods -o json -n kube-system --help", None), // help takes priority over JSON support
            ("kubectl get pods -o yaml -n default", None),            // YAML bypass
            ("kubectl get pods -o=yaml --log-level=debug", None),     // debug and YAML bypass
            ("git log --oneline --json", None),                       // no JSON support
            ("eslint --fix --format json", None),                     // no JSON support
        ];

        let mut test_cases: Vec<(String, Option<&str>)> = static_cases
            .into_iter()
            .map(|(cmd, expected)| (cmd.to_string(), expected))
            .collect();

        // 16. Dynamic generated matrix of combinatorial match and bypass variations (4000+ cases)
        let prefixes = [
            "",
            "CI=true",
            "timeout 30s",
            "cd src &&",
            "CI=true nice -n 10 timeout 5",
        ];

        let runners = ["", "npx", "pnpm exec", "uv run"];

        let tools = [
            ("cargo test", vec!["", " --workspace"], "cargo-test"),
            ("git diff", vec!["", " --stat --cached"], "git-diff"),
            ("docker build", vec!["", " -t tag ."], "docker-build"),
            ("pytest", vec!["", " tests/test_auth.py"], "pytest"),
            ("eslint", vec!["", " --fix"], "eslint"),
            ("kubectl get", vec!["", " pods"], "kubectl-get"),
        ];

        // Each generated command is also tried as `<wrapper> "<command>"`.
        let shell_wrappers = ["bash -c", "powershell -Command"];

        // Flags that must make resolution return no filter for every tool.
        let bypass_flags = [
            "--help",
            "-h",
            "--verbose",
            "--debug",
            "-vv",
            "--yaml",
            "-o yaml",
        ];

        // Records `cmd` itself, then every shell-wrapped form of it, all with
        // the same expectation.
        let mut push_with_wrappers = |cmd: &str, expected: Option<&'static str>| {
            test_cases.push((cmd.to_string(), expected));
            for wrapper in &shell_wrappers {
                test_cases.push((format!("{} \"{}\"", wrapper, cmd), expected));
            }
        };

        for prefix in &prefixes {
            for runner in &runners {
                for (tool, args_list, expected) in &tools {
                    for arg in args_list {
                        let mut cmd_parts = Vec::new();
                        if !prefix.is_empty() {
                            cmd_parts.push(*prefix);
                        }
                        if !runner.is_empty() {
                            cmd_parts.push(*runner);
                        }
                        cmd_parts.push(*tool);
                        if !arg.is_empty() {
                            cmd_parts.push(arg.trim());
                        }
                        let base_cmd = cmd_parts.join(" ");

                        // Base match, plain and shell-wrapped.
                        push_with_wrappers(&base_cmd, Some(*expected));

                        // Bypass variations, plain and shell-wrapped.
                        for flag in &bypass_flags {
                            let bypass_cmd = format!("{} {}", base_cmd, flag);
                            push_with_wrappers(&bypass_cmd, None);
                        }

                        // JSON bypass test: only the kubectl-get filter is
                        // expected to keep matching with `--json`.
                        let json_cmd = format!("{} --json", base_cmd);
                        let json_expected = if *expected == "kubectl-get" {
                            Some("kubectl-get")
                        } else {
                            None
                        };
                        push_with_wrappers(&json_cmd, json_expected);
                    }
                }
            }
        }

        for (cmd, expected) in &test_cases {
            assert_eq!(
                run(cmd),
                *expected,
                "command resolution failed for: {:?}",
                cmd
            );
        }
    }
}