use std::collections::HashMap;
use std::path::PathBuf;
use regex::Regex;
use rust_embed::Embed;
use serde::Deserialize;
/// One short-circuit rule evaluated by `apply_filter` before any line-level
/// filtering happens.
#[derive(Debug, Deserialize, Clone)]
pub struct MatchOutput {
/// Regex tested against the whole command output.
pub pattern: String,
/// Text returned in place of the output when the rule fires.
pub message: String,
/// Optional regex; when it also matches the output the rule is skipped.
#[serde(default)]
pub unless: Option<String>,
}
/// Declarative description of how to shrink the output of one command,
/// deserialized from a `[filters.<name>]` TOML table.
#[derive(Debug, Deserialize, Clone)]
pub struct FilterDef {
/// Free-form description; not read anywhere in this file.
#[allow(dead_code)]
pub description: Option<String>,
/// Regex matched by `find_filter` against the command line and the
/// candidates derived from it.
pub match_command: String,
/// When true, `apply_filter` passes the output through
/// `crate::compress::strip_ansi` first.
#[serde(default)]
pub strip_ansi: bool,
/// Regexes; a line matching any of them is dropped.
#[serde(default)]
pub strip_lines_matching: Vec<String>,
/// Regexes; when non-empty, only lines matching at least one are kept.
#[serde(default)]
pub keep_lines_matching: Vec<String>,
/// Keep at most this many leading lines (only used when neither
/// `head_lines` nor `tail_lines` is set, see `apply_sizing`).
pub max_lines: Option<usize>,
/// Keep only the first N lines; takes precedence over `tail_lines` and
/// `max_lines`.
pub head_lines: Option<usize>,
/// Keep only the last N lines; used when `head_lines` is unset.
pub tail_lines: Option<usize>,
/// Replacement text returned when the filtered result is blank.
pub on_empty: Option<String>,
/// Short-circuit rules checked, in order, before any other processing.
#[serde(default)]
pub match_output: Vec<MatchOutput>,
/// Maximum length of each output line in bytes (cut back to a char
/// boundary by `truncate_at_char_boundary`).
pub truncate_lines_at: Option<usize>,
/// Not read anywhere in this file.
#[serde(default)]
#[allow(dead_code)]
pub filter_stderr: bool,
/// `[pattern, replacement]` pairs applied to every line with
/// `Regex::replace_all`.
#[serde(default)]
pub replace_patterns: Vec<[String; 2]>,
/// Marker-delimited sections to extract from the output.
#[serde(default)]
pub extract_sections: Vec<ExtractSection>,
/// Optional relevance filter (embeddings with a keyword fallback).
#[serde(default)]
pub semantic_filter: Option<SemanticFilterDef>,
/// Optional collapsing of consecutive similar blocks.
#[serde(default)]
pub deduplicate_blocks: Option<DeduplicateBlocksDef>,
/// Optional summarization applied when the output parses as JSON.
#[serde(default)]
pub summarize_json: Option<SummarizeJsonDef>,
/// Optional token cap applied to the final text by `apply_token_budget`.
pub token_budget: Option<usize>,
}
/// A start/end marker pair used by `apply_extract_sections`.
#[derive(Debug, Deserialize, Clone)]
pub struct ExtractSection {
/// Regex; a matching line opens a section.
pub start_pattern: String,
/// Regex; a matching line closes the currently open section.
pub end_pattern: String,
/// When true, the start and end marker lines are part of the extracted text.
#[serde(default)]
pub include_markers: bool,
/// Stop after this many completed sections; unlimited when absent.
#[serde(default)]
pub max_matches: Option<usize>,
}
/// Settings for the relevance filter run by `apply_semantic_filter`.
#[derive(Debug, Deserialize, Clone)]
pub struct SemanticFilterDef {
/// Query text; embedded for similarity scoring, or split on whitespace into
/// keywords by the fallback path.
pub query: String,
/// Minimum cosine similarity for a line to be kept (defaults to 0.3).
#[serde(default = "default_semantic_threshold")]
pub threshold: f32,
/// Regexes; a line matching any of them is kept without scoring.
#[serde(default)]
pub always_keep: Vec<String>,
/// Optional model name handed to `crate::embed::set_active_model`.
pub model: Option<String>,
}
/// Serde fallback for `SemanticFilterDef::threshold` when the key is omitted.
fn default_semantic_threshold() -> f32 {
    const DEFAULT_THRESHOLD: f32 = 0.3;
    DEFAULT_THRESHOLD
}
/// Settings for `apply_deduplicate_blocks`.
#[derive(Debug, Deserialize, Clone)]
pub struct DeduplicateBlocksDef {
/// Blocks with fewer lines than this are not considered (defaults to 3).
#[serde(default = "default_min_block_lines")]
pub min_block_lines: usize,
/// Fraction of position-wise identical lines required for two equally long
/// blocks to count as similar (defaults to 0.8).
#[serde(default = "default_block_similarity")]
pub similarity: f32,
/// Regex identifying block separator lines; `^\s*$` when absent.
#[serde(default)]
pub block_delimiter: Option<String>,
}
/// Serde fallback for `DeduplicateBlocksDef::min_block_lines`.
fn default_min_block_lines() -> usize {
    const DEFAULT_MIN_LINES: usize = 3;
    DEFAULT_MIN_LINES
}
/// Serde fallback for `DeduplicateBlocksDef::similarity`.
fn default_block_similarity() -> f32 {
    const DEFAULT_SIMILARITY: f32 = 0.8;
    DEFAULT_SIMILARITY
}
/// Settings for `apply_summarize_json`.
#[derive(Debug, Deserialize, Clone)]
pub struct SummarizeJsonDef {
/// Arrays longer than this are cut and given a trailing "omitted" marker
/// (defaults to 10).
#[serde(default = "default_max_array_items")]
pub max_array_items: usize,
/// Nesting depth at which summarization stops descending (defaults to 3).
#[serde(default = "default_max_depth")]
pub max_depth: usize,
/// Object keys whose values are left untouched.
#[serde(default)]
pub always_include: Vec<String>,
/// Object keys removed wherever they are encountered.
#[serde(default)]
pub exclude: Vec<String>,
}
/// Serde fallback for `SummarizeJsonDef::max_array_items`.
fn default_max_array_items() -> usize {
    const DEFAULT_ITEMS: usize = 10;
    DEFAULT_ITEMS
}
/// Serde fallback for `SummarizeJsonDef::max_depth`.
fn default_max_depth() -> usize {
    const DEFAULT_DEPTH: usize = 3;
    DEFAULT_DEPTH
}
/// Top-level shape of a filter TOML file: a `[filters]` table keyed by
/// filter name. A file without that table deserializes to an empty map.
#[derive(Debug, Deserialize)]
struct FilterFile {
#[serde(default)]
filters: HashMap<String, FilterDef>,
}
/// A loaded filter together with its name and where it was loaded from.
pub struct ActiveFilter {
/// Key of the filter inside its `[filters]` table.
pub name: String,
/// Origin tag; `load_active_filters` uses "local", "user" or "bundled".
pub source: &'static str,
/// The filter definition itself.
pub filter: FilterDef,
}
/// Handle for the `*.toml` files under `assets/filters`, exposed through the
/// `rust_embed::Embed` derive.
#[derive(Embed)]
#[folder = "assets/filters"]
#[include = "*.toml"]
struct BundledFilters;
/// Returns the per-user filter directory, `<home>/.tokenix/filters`.
///
/// Falls back to the relative path `./.tokenix/filters` when no home
/// directory can be determined.
pub fn filters_dir() -> PathBuf {
    let mut dir = dirs::home_dir().unwrap_or_else(|| PathBuf::from("."));
    dir.push(".tokenix");
    dir.push("filters");
    dir
}
/// Parses one filter TOML document into `(name, definition)` pairs.
///
/// The pairs are sorted by filter name. The filters are deserialized into a
/// `HashMap`, whose iteration order is arbitrary, and `find_filter` returns
/// the first matching definition, so without sorting the winner between two
/// overlapping filters of the same file could change from run to run.
///
/// Parsing is best-effort: a document that is not valid TOML (or does not
/// match the expected schema) yields an empty list rather than an error.
fn parse_filter_file_named(content: &str) -> Vec<(String, FilterDef)> {
    let Ok(file) = toml::from_str::<FilterFile>(content) else {
        return Vec::new();
    };
    let mut named: Vec<(String, FilterDef)> = file.filters.into_iter().collect();
    named.sort_by(|a, b| a.0.cmp(&b.0));
    named
}
/// Loads the filters from the per-user directory, discarding their names.
pub fn load_user_filters() -> Vec<FilterDef> {
    let (_names, defs): (Vec<String>, Vec<FilterDef>) =
        load_user_filters_named().into_iter().unzip();
    defs
}
/// Reads every `*.toml` file directly inside `dir` and returns the filters
/// they define.
///
/// Files are visited in sorted path order because `read_dir` gives no
/// ordering guarantee and the order of the returned list decides which
/// filter is tried first. A missing or unreadable directory, unreadable
/// files and unparsable files are skipped silently (best-effort loading).
fn load_filters_from_dir(dir: &std::path::Path) -> Vec<(String, FilterDef)> {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return Vec::new();
    };
    let mut toml_files: Vec<PathBuf> = entries
        .flatten()
        .map(|entry| entry.path())
        .filter(|path| path.extension().and_then(|e| e.to_str()) == Some("toml"))
        .collect();
    toml_files.sort();
    toml_files
        .iter()
        .filter_map(|path| std::fs::read_to_string(path).ok())
        .flat_map(|content| parse_filter_file_named(&content))
        .collect()
}

/// Loads the named filters from the per-user directory (see [`filters_dir`]).
pub fn load_user_filters_named() -> Vec<(String, FilterDef)> {
    load_filters_from_dir(&filters_dir())
}

/// Loads the named filters from `<project root>/.tokenix/filters`, where the
/// project root is resolved from the current working directory by
/// `crate::store::find_project_root`.
///
/// Falls back to `.` when the working directory cannot be determined.
pub fn load_local_filters_named() -> Vec<(String, FilterDef)> {
    let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
    let root = crate::store::find_project_root(&cwd);
    load_filters_from_dir(&root.join(".tokenix").join("filters"))
}
/// Loads the project-local filters, discarding their names.
pub fn load_local_filters() -> Vec<FilterDef> {
    let named = load_local_filters_named();
    named.into_iter().map(|entry| entry.1).collect()
}
/// Loads the filters embedded in the binary, discarding their names.
pub fn load_bundled_filters() -> Vec<FilterDef> {
    let mut defs = Vec::new();
    defs.extend(
        load_bundled_filters_named()
            .into_iter()
            .map(|(_name, def)| def),
    );
    defs
}
/// Parses every embedded filter asset and returns the named filters found.
///
/// Assets that cannot be fetched or are not valid UTF-8 are skipped.
pub fn load_bundled_filters_named() -> Vec<(String, FilterDef)> {
    let mut named = Vec::new();
    for asset in BundledFilters::iter() {
        let Some(file) = BundledFilters::get(&asset) else {
            continue;
        };
        if let Ok(text) = std::str::from_utf8(file.data.as_ref()) {
            named.extend(parse_filter_file_named(text));
        }
    }
    named
}
pub fn load_active_filters() -> Vec<ActiveFilter> {
let mut result: Vec<ActiveFilter> = load_local_filters_named()
.into_iter()
.map(|(name, filter)| ActiveFilter {
name,
source: "local",
filter,
})
.collect();
result.extend(
load_user_filters_named()
.into_iter()
.map(|(name, filter)| ActiveFilter {
name,
source: "user",
filter,
}),
);
result.extend(
load_bundled_filters_named()
.into_iter()
.map(|(name, filter)| ActiveFilter {
name,
source: "bundled",
filter,
}),
);
result
}
/// Returns all filter definitions: local first, then user, then bundled.
pub fn load_all_filters() -> Vec<FilterDef> {
    load_local_filters()
        .into_iter()
        .chain(load_user_filters())
        .chain(load_bundled_filters())
        .collect()
}
/// Returns the first filter whose `match_command` regex matches `cmd` or any
/// candidate derived from it by `derive_command_candidates`.
///
/// Filters whose `match_command` is not a valid regex are skipped.
pub fn find_filter<'a>(cmd: &str, filters: &'a [FilterDef]) -> Option<&'a FilterDef> {
    let candidates = derive_command_candidates(cmd);
    filters.iter().find(|def| {
        Regex::new(&def.match_command)
            .map(|re| candidates.iter().any(|candidate| re.is_match(candidate)))
            .unwrap_or(false)
    })
}
/// Splits a command line into words using simplified shell rules.
///
/// * A backslash makes the next character literal (also inside quotes) and is
///   itself dropped; a trailing lone backslash is kept.
/// * Single and double quotes group characters into one word and are removed.
/// * Unquoted whitespace separates words; empty words are never emitted.
pub fn tokenize_command(command: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut word = String::new();
    let mut open_quote: Option<char> = None;
    let mut escaped = false;

    for ch in command.trim().chars() {
        match (escaped, open_quote, ch) {
            (true, _, _) => {
                word.push(ch);
                escaped = false;
            }
            (false, _, '\\') => escaped = true,
            (false, Some(q), _) if ch == q => open_quote = None,
            (false, Some(_), _) => word.push(ch),
            (false, None, '\'') | (false, None, '"') => open_quote = Some(ch),
            (false, None, _) if ch.is_whitespace() => {
                if !word.is_empty() {
                    words.push(std::mem::take(&mut word));
                }
            }
            (false, None, _) => word.push(ch),
        }
    }

    // A dangling escape has nothing to escape, so keep the backslash itself.
    if escaped {
        word.push('\\');
    }
    if !word.is_empty() {
        words.push(word);
    }
    words
}
/// If `cmd` launches a known shell with a "run this command string" flag,
/// returns that command string (trimmed); otherwise returns `None`.
///
/// The launcher is recognised by the lower-cased file name of the first word,
/// with an optional `.exe` suffix removed. Recognised flags:
///
/// * `cmd`: `/c` or `-c` (case-insensitive)
/// * `powershell` / `pwsh`: `-c`, `-command`, `--command` (case-insensitive)
/// * other shells: `--command`, or a single-dash option cluster containing
///   `c` (`-c`, `-lc`, `-ec`, ...)
///
/// Only the single argument following the flag is returned. Long options
/// that merely contain the letter `c` (`--norc`, `--rcfile`, ...) are not
/// treated as the command flag, so `bash --norc -c 'x'` yields `x`.
pub fn unwrap_shell_runner(cmd: &str) -> Option<String> {
    let argv = tokenize_command(cmd);
    let first = argv.first()?;
    let launcher_name = std::path::Path::new(first)
        .file_name()
        .and_then(|f| f.to_str())
        .unwrap_or(first)
        .to_lowercase();
    let launcher = launcher_name
        .strip_suffix(".exe")
        .unwrap_or(&launcher_name);

    let is_shell = matches!(
        launcher,
        "bash"
            | "sh"
            | "zsh"
            | "fish"
            | "dash"
            | "ksh"
            | "mksh"
            | "ash"
            | "csh"
            | "tcsh"
            | "cmd"
            | "powershell"
            | "pwsh"
    );
    if !is_shell {
        return None;
    }

    let is_command_flag = |arg: &str| -> bool {
        match launcher {
            "cmd" => arg.eq_ignore_ascii_case("/c") || arg.eq_ignore_ascii_case("-c"),
            "powershell" | "pwsh" => {
                arg.eq_ignore_ascii_case("-c")
                    || arg.eq_ignore_ascii_case("-command")
                    || arg.eq_ignore_ascii_case("--command")
            }
            _ => {
                // Short option cluster such as `-c` or `-lc`, never a
                // `--long-option` that just happens to contain a `c`.
                let short_cluster =
                    arg.starts_with('-') && !arg.starts_with("--") && arg.contains('c');
                short_cluster || arg == "--command"
            }
        }
    };

    // Pair every argument after the launcher with its successor; the flag
    // must be followed by the command string, so the last word can never be
    // the flag.
    argv.windows(2)
        .skip(1)
        .find(|pair| is_command_flag(&pair[0]))
        .map(|pair| pair[1].trim().to_string())
}
/// Reports whether `s` has the form `NAME=...`, where `NAME` starts with an
/// ASCII letter or `_` and continues with ASCII letters, digits or `_`.
fn is_env_assignment(s: &str) -> bool {
    let Some((name, _value)) = s.split_once('=') else {
        return false;
    };
    let mut name_bytes = name.bytes();
    match name_bytes.next() {
        Some(first) if first.is_ascii_alphabetic() || first == b'_' => {
            name_bytes.all(|b| b.is_ascii_alphanumeric() || b == b'_')
        }
        _ => false,
    }
}
/// Drops leading `NAME=value` words and, if the next word is `env` (by file
/// name), that `env` invocation together with the options and assignments it
/// consumes. Returns the remaining words.
///
/// Inside an `env` invocation:
/// * `--` ends option parsing and is dropped,
/// * `-i`, `-0`, `--ignore-environment`, `--debug` and `--opt=value` forms of
///   `--unset`, `--chdir`, `--split-string` consume one word,
/// * `-u`, `--unset`, `-C`, `--chdir`, `-S`, `--split-string` consume two
///   words (the option and its value),
/// * anything else is taken to be the start of the real command.
///
/// An option that expects a value but is the last word (e.g. `env -u`) yields
/// an empty result instead of indexing past the end of `argv`.
fn strip_leading_env_assignments(argv: &[String]) -> Vec<String> {
    let mut index = argv
        .iter()
        .take_while(|arg| is_env_assignment(arg.as_str()))
        .count();

    let launches_env = argv.get(index).map_or(false, |cmd| {
        let name = std::path::Path::new(cmd)
            .file_name()
            .and_then(|f| f.to_str())
            .unwrap_or(cmd);
        name == "env"
    });

    if launches_env {
        index += 1;
        while let Some(arg) = argv.get(index) {
            match arg.as_str() {
                "--" => {
                    index += 1;
                    break;
                }
                "-i" | "-0" | "--ignore-environment" | "--debug" => index += 1,
                "-u" | "--unset" | "-C" | "--chdir" | "-S" | "--split-string" => index += 2,
                other
                    if is_env_assignment(other)
                        || other.starts_with("--unset=")
                        || other.starts_with("--chdir=")
                        || other.starts_with("--split-string=") =>
                {
                    index += 1
                }
                _ => break,
            }
        }
    }

    // `index` may have stepped past the end when a value-taking option was the
    // final word; clamp so the slice below cannot panic.
    argv[index.min(argv.len())..].to_vec()
}
/// Removes leading `cd`/`pushd` steps that are followed by `&&` or `;`.
///
/// Handles both `cd && rest` (two words dropped) and `cd <dir> && rest`
/// (three words dropped), repeating at most eight times. The separator must
/// be a standalone word.
fn strip_cd_and_operators(mut argv: &[String]) -> &[String] {
    let is_dir_change = |word: &String| word == "cd" || word == "pushd";
    let is_separator = |word: &String| word == "&&" || word == ";";

    for _ in 0..8 {
        let consumed = match argv {
            [head, sep, ..] if is_dir_change(head) && is_separator(sep) => 2,
            [head, _, sep, ..] if is_dir_change(head) && is_separator(sep) => 3,
            _ => break,
        };
        argv = &argv[consumed..];
    }
    argv
}
/// Reduces a command line to the command that actually does the work.
///
/// Repeatedly (at most 16 rounds):
/// 1. unwraps a shell launcher (`bash -c '...'`, `cmd /c ...`, ...),
/// 2. otherwise strips leading env assignments / `env` and leading
///    `cd ... &&` steps,
/// until a round removes nothing. The surviving words are re-joined with
/// single spaces, so original quoting is not preserved.
pub fn get_effective_command(cmd: &str) -> String {
    let mut current = cmd.trim().to_string();
    for _ in 0..16 {
        if let Some(inner) = unwrap_shell_runner(&current) {
            current = inner;
            continue;
        }
        let tokens = tokenize_command(&current);
        if tokens.is_empty() {
            break;
        }
        let stripped_env = strip_leading_env_assignments(&tokens);
        let stripped_cd = strip_cd_and_operators(&stripped_env);
        // Nothing was removed in this round: the command is fully reduced.
        if stripped_cd.len() == tokens.len() {
            break;
        }
        current = stripped_cd.join(" ");
    }
    current
}
/// Splits a command line at the unquoted, unescaped operators `&&`, `||`,
/// `;` and `|`, returning the trimmed, non-empty segments.
///
/// Quotes and backslashes are kept in the segments; they only prevent
/// operators inside them from splitting. A single `&` is not an operator.
pub fn split_on_operators(cmd: &str) -> Vec<String> {
    let mut segments = Vec::new();
    let mut piece = String::new();
    let mut open_quote: Option<char> = None;
    let mut escaped = false;
    let mut stream = cmd.chars().peekable();

    while let Some(ch) = stream.next() {
        if escaped {
            piece.push(ch);
            escaped = false;
            continue;
        }
        if ch == '\\' {
            piece.push(ch);
            escaped = true;
            continue;
        }
        if let Some(q) = open_quote {
            piece.push(ch);
            if ch == q {
                open_quote = None;
            }
            continue;
        }
        match ch {
            '\'' | '"' => {
                open_quote = Some(ch);
                piece.push(ch);
            }
            // Doubled operator (`&&` / `||`): consume the second character too.
            '&' | '|' if stream.peek() == Some(&ch) => {
                stream.next();
                push_segment(&mut segments, &mut piece);
            }
            ';' | '|' => push_segment(&mut segments, &mut piece),
            _ => piece.push(ch),
        }
    }
    push_segment(&mut segments, &mut piece);
    segments
}
/// Moves the trimmed contents of `current` into `segments` (unless blank)
/// and leaves `current` empty.
fn push_segment(segments: &mut Vec<String>, current: &mut String) {
    let raw = std::mem::take(current);
    let piece = raw.trim();
    if piece.is_empty() {
        return;
    }
    segments.push(piece.to_owned());
}
/// Appends the trimmed `candidate` unless it is blank or already present.
fn push_unique(candidates: &mut Vec<String>, candidate: &str) {
    let candidate = candidate.trim();
    if candidate.is_empty() {
        return;
    }
    let is_new = candidates.iter().all(|existing| existing != candidate);
    if is_new {
        candidates.push(candidate.to_owned());
    }
}
/// Builds the ordered, de-duplicated list of strings a filter's
/// `match_command` is tried against.
///
/// In order: the command as given, the body of a shell launcher (if any), the
/// effective command, and then, for the command and the shell body, every
/// operator-separated segment followed by that segment's effective command.
pub fn derive_command_candidates(cmd: &str) -> Vec<String> {
    let shell_body = unwrap_shell_runner(cmd);
    let mut out = Vec::new();

    push_unique(&mut out, cmd);
    if let Some(body) = shell_body.as_deref() {
        push_unique(&mut out, body);
    }
    push_unique(&mut out, &get_effective_command(cmd));

    for base in std::iter::once(cmd).chain(shell_body.as_deref()) {
        for segment in split_on_operators(base) {
            push_unique(&mut out, &segment);
            push_unique(&mut out, &get_effective_command(&segment));
        }
    }
    out
}
/// Runs the whole filter pipeline described by `f` over `output`.
///
/// Stages, in order:
/// 1. `match_output` short-circuits: the first rule whose pattern matches
///    (and whose `unless`, if any, does not) returns its message immediately.
/// 2. optional ANSI stripping, then splitting into lines,
/// 3. `strip_lines_matching`, `keep_lines_matching`,
/// 4. `extract_sections`, `replace_patterns`, `deduplicate_blocks`,
///    `semantic_filter`, `summarize_json`,
/// 5. head/tail/max line sizing, per-line truncation, token budget,
/// 6. `on_empty` substitution when the result is blank.
///
/// Patterns that fail to compile are ignored throughout.
pub fn apply_filter(output: &str, f: &FilterDef) -> String {
    let matches_output = |pattern: &str| {
        Regex::new(pattern)
            .map(|re| re.is_match(output))
            .unwrap_or(false)
    };
    let short_circuit = f.match_output.iter().find(|rule| {
        matches_output(&rule.pattern)
            && !rule
                .unless
                .as_deref()
                .map_or(false, |unless| matches_output(unless))
    });
    if let Some(rule) = short_circuit {
        return rule.message.clone();
    }

    let text = if f.strip_ansi {
        crate::compress::strip_ansi(output)
    } else {
        output.to_string()
    };
    let mut lines: Vec<String> = text.lines().map(str::to_owned).collect();

    let compile_all = |patterns: &[String]| -> Vec<Regex> {
        patterns.iter().filter_map(|p| Regex::new(p).ok()).collect()
    };
    if !f.strip_lines_matching.is_empty() {
        let deny = compile_all(&f.strip_lines_matching);
        lines.retain(|line| !deny.iter().any(|re| re.is_match(line)));
    }
    if !f.keep_lines_matching.is_empty() {
        let allow = compile_all(&f.keep_lines_matching);
        lines.retain(|line| allow.iter().any(|re| re.is_match(line)));
    }

    if !f.extract_sections.is_empty() {
        lines = apply_extract_sections(lines, &f.extract_sections);
    }
    if !f.replace_patterns.is_empty() {
        lines = apply_replace_patterns(lines, &f.replace_patterns);
    }
    if let Some(dedup) = &f.deduplicate_blocks {
        lines = apply_deduplicate_blocks(lines, dedup);
    }
    if let Some(semantic) = &f.semantic_filter {
        lines = apply_semantic_filter(lines, semantic);
    }
    if let Some(summarize) = &f.summarize_json {
        lines = apply_summarize_json(lines, summarize);
    }

    let lines = apply_sizing(lines, f);
    let mut result = match f.truncate_lines_at {
        Some(max_len) => lines
            .iter()
            .map(|line| truncate_at_char_boundary(line, max_len))
            .collect::<Vec<_>>()
            .join("\n"),
        None => lines.join("\n"),
    };
    if let Some(budget) = f.token_budget {
        result = apply_token_budget(&result, budget);
    }

    match &f.on_empty {
        Some(msg) if result.trim().is_empty() => msg.clone(),
        _ => result,
    }
}
/// Keeps only the text between start/end marker lines for each section
/// definition; returns the input unchanged when nothing was extracted.
///
/// For every section (skipped when either pattern fails to compile):
/// * a line matching `start_pattern` opens a section,
/// * following lines are collected until one matches `end_pattern`,
/// * marker lines are included only when `include_markers` is set,
/// * at most `max_matches` completed sections are taken,
/// * a section still open at the end of input is kept only when
///   `include_markers` is set.
fn apply_extract_sections(lines: Vec<String>, sections: &[ExtractSection]) -> Vec<String> {
    let mut result = Vec::new();
    // Note: re-splitting the joined text drops one trailing empty line, so a
    // final blank line is never seen as an end marker.
    let content = lines.join("\n");

    for section in sections {
        let start_re = match Regex::new(&section.start_pattern) {
            Ok(re) => re,
            Err(_) => continue,
        };
        let end_re = match Regex::new(&section.end_pattern) {
            Ok(re) => re,
            Err(_) => continue,
        };

        let max_matches = section.max_matches.unwrap_or(usize::MAX);
        let mut matches = 0usize;
        let mut in_section = false;
        let mut section_lines = Vec::new();

        for line in content.lines() {
            let start_match = start_re.is_match(line);
            let end_match = end_re.is_match(line);

            if start_match && !in_section {
                in_section = true;
                if section.include_markers {
                    section_lines.push(line.to_string());
                }
                continue;
            }
            if !in_section {
                continue;
            }

            if section.include_markers || !end_match {
                section_lines.push(line.to_string());
            }
            if end_match {
                result.append(&mut section_lines);
                matches += 1;
                in_section = false;
                if matches >= max_matches {
                    break;
                }
            }
        }

        if in_section && section.include_markers {
            result.extend(section_lines);
        }
    }

    if result.is_empty() {
        lines
    } else {
        result
    }
}
fn apply_replace_patterns(lines: Vec<String>, patterns: &[[String; 2]]) -> Vec<String> {
lines
.into_iter()
.map(|mut line| {
for [pattern, replacement] in patterns {
if let Ok(re) = Regex::new(pattern) {
line = re.replace_all(&line, replacement.as_str()).to_string();
}
}
line
})
.collect()
}
/// Collapses runs of consecutive similar blocks into the first block plus an
/// `[... N similar block(s) omitted ...]` marker.
///
/// Lines are grouped into blocks separated by lines matching the delimiter
/// regex (`^\s*$` by default). Returns the input untouched when the delimiter
/// does not compile or fewer than two qualifying blocks exist.
///
/// NOTE(review): once deduplication proceeds, the output is rebuilt from the
/// qualifying blocks only. Delimiter lines and blocks shorter than
/// `min_block_lines` are therefore dropped. Confirm this is intended.
fn apply_deduplicate_blocks(lines: Vec<String>, dedup: &DeduplicateBlocksDef) -> Vec<String> {
    let delimiter = dedup.block_delimiter.as_deref().unwrap_or(r"^\s*$");
    let Ok(delim_re) = Regex::new(delimiter) else {
        return lines;
    };

    let mut blocks: Vec<Vec<String>> = Vec::new();
    let mut pending: Vec<String> = Vec::new();
    for line in &lines {
        // A delimiter only closes a block that has content; a delimiter seen
        // while the pending block is empty becomes its first line.
        let closes_block = delim_re.is_match(line) && !pending.is_empty();
        if !closes_block {
            pending.push(line.clone());
            continue;
        }
        let finished = std::mem::take(&mut pending);
        if finished.len() >= dedup.min_block_lines {
            blocks.push(finished);
        }
    }
    if !pending.is_empty() && pending.len() >= dedup.min_block_lines {
        blocks.push(pending);
    }

    if blocks.len() < 2 {
        return lines;
    }

    let mut result = Vec::new();
    let mut idx = 0;
    while idx < blocks.len() {
        let head = &blocks[idx];
        result.extend(head.iter().cloned());
        let repeats = blocks[idx + 1..]
            .iter()
            .take_while(|next| blocks_similar(head, next, dedup.similarity))
            .count();
        if repeats > 0 {
            result.push(format!("[... {} similar block(s) omitted ...]", repeats));
        }
        idx += 1 + repeats;
    }
    result
}
/// Two blocks are similar when they have the same number of lines and the
/// fraction of position-wise identical lines is at least `threshold`.
fn blocks_similar(a: &[String], b: &[String], threshold: f32) -> bool {
    if a.len() != b.len() {
        return false;
    }
    let identical = a.iter().zip(b).filter(|(left, right)| left == right).count();
    let ratio = identical as f32 / a.len() as f32;
    ratio >= threshold
}
/// Filters lines by relevance to the configured query, using embeddings when
/// they are available and the keyword fallback when the embedding path
/// returns an error.
fn apply_semantic_filter(lines: Vec<String>, semantic: &SemanticFilterDef) -> Vec<String> {
    let attempt = apply_semantic_filter_with_embeddings(&lines, semantic);
    match attempt {
        Ok(filtered) => filtered,
        Err(_) => apply_semantic_filter_keyword_fallback(lines, semantic),
    }
}
/// Keeps the lines whose embedding is close enough to the query embedding.
///
/// * Lines matching an `always_keep` regex are kept without scoring.
/// * Other lines shorter than 5 bytes after trimming are dropped.
/// * Remaining lines are kept when their cosine similarity to the query is at
///   least `semantic.threshold`.
///
/// When `semantic.model` is set it is passed to
/// `crate::embed::set_active_model` before embedding.
///
/// # Errors
/// Propagates any error from `crate::embed::embed_query`.
fn apply_semantic_filter_with_embeddings(
    lines: &[String],
    semantic: &SemanticFilterDef,
) -> Result<Vec<String>, anyhow::Error> {
    use crate::embed::{embed_query, set_active_model};

    if let Some(model) = &semantic.model {
        set_active_model(model);
    }
    let query_vec = embed_query(&semantic.query)?;

    let pinned: Vec<Regex> = semantic
        .always_keep
        .iter()
        .filter_map(|pattern| Regex::new(pattern).ok())
        .collect();

    let mut kept = Vec::new();
    for line in lines {
        let is_pinned = pinned.iter().any(|re| re.is_match(line));
        if is_pinned {
            kept.push(line.clone());
        } else if line.trim().len() >= 5 {
            let line_vec = embed_query(line)?;
            let score = cosine_similarity(&query_vec, &line_vec);
            if score >= semantic.threshold {
                kept.push(line.clone());
            }
        }
    }
    Ok(kept)
}
/// Keyword-based stand-in for the embedding filter.
///
/// A line is kept when it matches an `always_keep` regex or contains, ignoring
/// case, at least one whitespace-separated term of the query. The query terms
/// are lower-cased once up front instead of once per line.
fn apply_semantic_filter_keyword_fallback(
    lines: Vec<String>,
    semantic: &SemanticFilterDef,
) -> Vec<String> {
    let query_terms: Vec<String> = semantic
        .query
        .split_whitespace()
        .map(str::to_lowercase)
        .collect();
    let always_keep_patterns: Vec<Regex> = semantic
        .always_keep
        .iter()
        .filter_map(|p| Regex::new(p).ok())
        .collect();

    lines
        .into_iter()
        .filter(|line| {
            if always_keep_patterns.iter().any(|re| re.is_match(line)) {
                return true;
            }
            let line_lower = line.to_lowercase();
            query_terms
                .iter()
                .any(|term| line_lower.contains(term.as_str()))
        })
        .collect()
}
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the vectors differ in length, are empty, or either has
/// zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }
    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a = magnitude(a);
    let norm_b = magnitude(b);
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a * norm_b)
}
/// Summarizes the output when, taken as a whole, it is a JSON object or
/// array; otherwise returns the lines unchanged.
///
/// The summarized value is re-rendered with `serde_json::to_string_pretty`
/// and split back into lines.
fn apply_summarize_json(lines: Vec<String>, summarize: &SummarizeJsonDef) -> Vec<String> {
    let joined = lines.join("\n");
    let body = joined.trim();

    let looks_like_json = body.starts_with('{') || body.starts_with('[');
    if !looks_like_json {
        return lines;
    }
    let mut value = match serde_json::from_str::<serde_json::Value>(body) {
        Ok(parsed) => parsed,
        Err(_) => return lines,
    };

    summarize_json_value(&mut value, summarize, 0);
    let rendered = serde_json::to_string_pretty(&value).unwrap_or(joined);
    rendered.lines().map(str::to_owned).collect()
}
/// Recursively trims a JSON value in place.
///
/// * Stops descending once `depth` reaches `max_depth`.
/// * Objects: keys listed in `exclude` are removed; values of keys listed in
///   `always_include` are left untouched; other values are processed one
///   level deeper.
/// * Arrays: cut to `max_array_items` elements plus a string marker saying
///   how many were omitted; the elements are then processed one level deeper.
///
/// NOTE(review): below the top level the "qualified" key compared against
/// `always_include` is built from the numeric depth (`"<depth>.<key>"`), not
/// from the parent key path. Confirm whether a dotted path was intended.
fn summarize_json_value(value: &mut serde_json::Value, summarize: &SummarizeJsonDef, depth: usize) {
    if depth >= summarize.max_depth {
        return;
    }
    match value {
        serde_json::Value::Object(map) => {
            let excluded: Vec<String> = map
                .keys()
                .filter(|key| summarize.exclude.iter().any(|ex| ex == *key))
                .cloned()
                .collect();
            for key in &excluded {
                map.remove(key);
            }

            for (key, child) in map.iter_mut() {
                let qualified = if depth == 0 {
                    key.clone()
                } else {
                    format!("{}.{}", depth, key)
                };
                let pinned = summarize
                    .always_include
                    .iter()
                    .any(|inc| inc == &qualified || inc == key);
                if !pinned {
                    summarize_json_value(child, summarize, depth + 1);
                }
            }
        }
        serde_json::Value::Array(arr) => {
            let limit = summarize.max_array_items;
            if arr.len() > limit {
                let omitted = arr.len() - limit;
                arr.truncate(limit);
                arr.push(serde_json::Value::String(format!(
                    "... {} more item(s) omitted ...",
                    omitted
                )));
            }
            for item in arr.iter_mut() {
                summarize_json_value(item, summarize, depth + 1);
            }
        }
        _ => {}
    }
}
/// Shrinks `text` to roughly `budget` tokens (as counted by
/// `crate::chunker::count_tokens`) by dropping whole lines.
///
/// Text already within budget is returned unchanged. Otherwise lines are
/// admitted in two passes, each stopping at the first line that would exceed
/// the budget:
/// 1. high-priority lines: those starting with `error`, `warning`, `FAIL` or
///    `panic`, containing `error[` / `warning[`, or lying in the first or
///    last quarter of the text,
/// 2. all remaining lines.
///
/// The admitted lines are emitted in their original order, followed by a
/// marker line stating how many lines were omitted. The marker itself is not
/// counted against the budget.
fn apply_token_budget(text: &str, budget: usize) -> String {
    if crate::chunker::count_tokens(text) <= budget {
        return text.to_string();
    }
    let lines: Vec<&str> = text.lines().collect();
    if lines.is_empty() {
        return text.to_string();
    }
    let total = lines.len();

    let is_high_priority = |i: usize| {
        let t = lines[i].trim();
        t.starts_with("error")
            || t.starts_with("warning")
            || t.starts_with("FAIL")
            || t.starts_with("panic")
            || t.contains("error[")
            || t.contains("warning[")
            || i < total / 4
            || i >= total * 3 / 4
    };
    let (priority, other): (Vec<usize>, Vec<usize>) =
        (0..total).partition(|&i| is_high_priority(i));

    let mut kept: Vec<usize> = Vec::with_capacity(total);
    let mut used = 0usize;
    for tier in [priority, other].iter() {
        for &i in tier {
            let line_tokens = crate::chunker::count_tokens(lines[i]);
            if used + line_tokens > budget {
                break;
            }
            kept.push(i);
            used += line_tokens;
        }
    }

    // Selection was priority-first; restore source order so the result still
    // reads top to bottom.
    kept.sort_unstable();

    let mut result: Vec<String> = kept.iter().map(|&i| lines[i].to_string()).collect();
    if kept.len() < total {
        result.push(format!(
            "[... {} lines omitted to fit token budget {} ...]",
            total - kept.len(),
            budget
        ));
    }
    result.join("\n")
}
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Offset 0 is always a boundary, so the search cannot fail.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&offset| s.is_char_boundary(offset))
        .unwrap_or(0);
    &s[..cut]
}
/// Applies exactly one line-count limit, chosen by precedence:
/// `head_lines` (keep the first N), else `tail_lines` (keep the last N), else
/// `max_lines` (keep the first N). With none set the lines pass through.
fn apply_sizing(mut lines: Vec<String>, f: &FilterDef) -> Vec<String> {
    match (f.head_lines, f.tail_lines, f.max_lines) {
        (Some(head), _, _) => lines.truncate(head),
        (None, Some(tail), _) => {
            let excess = lines.len().saturating_sub(tail);
            lines.drain(..excess);
        }
        (None, None, Some(max)) => lines.truncate(max),
        (None, None, None) => {}
    }
    lines
}
/// Builds the LLM prompt asking for a tokenix TOML filter for `command`.
///
/// The prompt embeds the filter schema, authoring rules and `sample_output`
/// verbatim; `command` is interpolated in four places.
pub fn build_filter_prompt(command: &str, sample_output: &str) -> String {
    format!(
        r#"Generate a tokenix TOML filter for the command `{command}`.
TOML filter schema (all fields optional except match_command):
```
[filters.<slug>]
description = "human-readable purpose"
match_command = "^regex_to_match_full_command_line"
strip_ansi = true # remove ANSI color codes
strip_lines_matching = ["^pattern1", "^pattern2"] # drop noisy lines
keep_lines_matching = ["error", "warning"] # keep only signal lines
match_output = [ # short-circuit: if output matches pattern, return message
{{ pattern = "already installed", message = "ok (already installed)" }},
# optional `unless`: skip the short-circuit if output also matches it (avoids masking errors)
{{ pattern = "Build complete!", message = "ok (build complete)", unless = "warning:|error:" }},
]
max_lines = 50 # truncate to N lines
head_lines = 30 # keep first N lines
tail_lines = 10 # keep last N lines
truncate_lines_at = 120 # truncate individual lines at N chars
on_empty = "command: ok" # message when filter produces empty output
# ADVANCED (extended filtering capabilities):
replace_patterns = [ # regex replacements: [[pattern, replacement], ...]
["\\d+\\.\\d+s", "<duration>"],
["/home/[^/]+/", "~/"],
]
extract_sections = [ # extract content between markers
{{ start_pattern = "---- FAILURES ----", end_pattern = "^\\s*$", include_markers = true, max_matches = 3 }},
]
semantic_filter = {{ # embedding-based relevance filtering (uses daemon/embed)
query = "test failure error panic",
threshold = 0.3,
always_keep = ["^error\\[", "^FAIL"],
model = "nomic-v1.5"
}}
deduplicate_blocks = {{ # structural block deduplication
min_block_lines = 3,
similarity = 0.8,
block_delimiter = "^\\s*$"
}}
summarize_json = {{ # intelligent JSON summarization
max_array_items = 10,
max_depth = 3,
always_include = ["packages", "workspace_members"],
exclude = ["manifest", "dependencies"]
}}
token_budget = 2000 # hard token limit with smart truncation
```
Rules:
- Use strip_lines_matching to drop boilerplate (progress, verbose info)
- Use keep_lines_matching only if output has a clear signal/noise separation
- Use match_output for commands that succeed silently or with a predictable summary line
- Set on_empty when the command normally succeeds silently
- Use replace_patterns to normalize paths, timestamps, IDs, etc.
- Use extract_sections to pull out failure blocks, error sections, etc.
- Use semantic_filter for query-aware relevance (requires embed model)
- Use deduplicate_blocks for repetitive output (test runs, build steps)
- Use summarize_json for large JSON (cargo metadata, API responses)
- Use token_budget as a hard cap with priority-based truncation
- match_command must be a valid Rust regex matching `{command}` or its typical invocations
- Return ONLY valid TOML, no markdown code fences, no explanations
Sample output from `{command} --help` (or similar):
---
{sample_output}
---
TOML filter:"#,
        command = command,
        sample_output = sample_output
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A filter with every optional feature switched off; tests override the
    /// fields they care about with struct-update syntax.
    fn blank_filter(match_command: &str) -> FilterDef {
        FilterDef {
            description: None,
            match_command: match_command.to_string(),
            strip_ansi: false,
            strip_lines_matching: vec![],
            keep_lines_matching: vec![],
            max_lines: None,
            head_lines: None,
            tail_lines: None,
            on_empty: None,
            match_output: vec![],
            truncate_lines_at: None,
            filter_stderr: false,
            replace_patterns: vec![],
            extract_sections: vec![],
            semantic_filter: None,
            deduplicate_blocks: None,
            summarize_json: None,
            token_budget: None,
        }
    }

    #[test]
    fn test_load_local_filters() {
        let local_dir = std::env::current_dir()
            .unwrap()
            .join(".tokenix")
            .join("filters");
        // Remember whether the directory was already there so cleanup never
        // deletes filters that belong to the checkout.
        let dir_preexisted = local_dir.exists();
        std::fs::create_dir_all(&local_dir).unwrap();
        let toml_path = local_dir.join("test_local_cmd.toml");
        std::fs::write(
            &toml_path,
            r#"
[filters.test_local_cmd]
description = "test local"
match_command = "^test_local_cmd$"
on_empty = "empty filter output"
"#,
        )
        .unwrap();

        let local_filters = load_local_filters();

        // Clean up before asserting so a failure does not leave files behind.
        let _ = std::fs::remove_file(&toml_path);
        if !dir_preexisted {
            let _ = std::fs::remove_dir_all(&local_dir);
        }

        assert!(!local_filters.is_empty());
        let found = find_filter("test_local_cmd", &local_filters);
        assert!(found.is_some());
        let filter = found.unwrap();
        assert_eq!(filter.on_empty.as_deref(), Some("empty filter output"));
    }

    #[test]
    fn test_tokenize_command() {
        assert_eq!(tokenize_command("cargo test"), vec!["cargo", "test"]);
        assert_eq!(
            tokenize_command("echo \"hello world\""),
            vec!["echo", "hello world"]
        );
        assert_eq!(
            tokenize_command("env CI=true cargo test"),
            vec!["env", "CI=true", "cargo", "test"]
        );
    }

    #[test]
    fn test_unwrap_shell_runner() {
        assert_eq!(
            unwrap_shell_runner("bash -c 'cargo test'"),
            Some("cargo test".to_string())
        );
        assert_eq!(
            unwrap_shell_runner("powershell -Command \"cargo test\""),
            Some("cargo test".to_string())
        );
        assert_eq!(
            unwrap_shell_runner("cmd.exe /c \"cargo test\""),
            Some("cargo test".to_string())
        );
        assert_eq!(unwrap_shell_runner("cargo test"), None);
    }

    #[test]
    fn test_get_effective_command() {
        assert_eq!(
            get_effective_command("cd /app && CI=true cargo test"),
            "cargo test"
        );
        assert_eq!(
            get_effective_command("bash -c 'cd /app && CI=true env cargo test'"),
            "cargo test"
        );
        assert_eq!(
            get_effective_command("env CI=true cargo test"),
            "cargo test"
        );
    }

    #[test]
    fn test_derive_command_candidates() {
        let cmd = "bash -c 'cd /app && cargo test'";
        let candidates = derive_command_candidates(cmd);
        assert!(candidates.contains(&"bash -c 'cd /app && cargo test'".to_string()));
        assert!(candidates.contains(&"cd /app && cargo test".to_string()));
        assert!(candidates.contains(&"cargo test".to_string()));
    }

    #[test]
    fn truncate_at_char_boundary_handles_multibyte() {
        assert_eq!(truncate_at_char_boundary("hello world", 5), "hello");
        assert_eq!(truncate_at_char_boundary("hi", 10), "hi");
        // 'é' occupies bytes 3..5, so a 4-byte cut must fall back to byte 3.
        let s = "café latte";
        let out = truncate_at_char_boundary(s, 4);
        assert!(s.starts_with(out));
        assert_eq!(out, "caf");
    }

    #[test]
    fn apply_filter_truncate_lines_at_no_panic_on_utf8() {
        let f = FilterDef {
            truncate_lines_at: Some(4),
            ..blank_filter(".*")
        };
        let out = apply_filter("café\nação\n", &f);
        assert_eq!(out, "caf\naç");
    }

    #[test]
    fn split_on_operators_handles_compound_commands() {
        assert_eq!(
            split_on_operators("cd foo && gitleaks detect"),
            vec!["cd foo", "gitleaks detect"]
        );
        assert_eq!(
            split_on_operators("cd foo;gitleaks"),
            vec!["cd foo", "gitleaks"]
        );
        assert_eq!(split_on_operators("a || b"), vec!["a", "b"]);
        assert_eq!(
            split_on_operators("producer | gitleaks detect"),
            vec!["producer", "gitleaks detect"]
        );
        assert_eq!(
            split_on_operators(r#"echo "a;b" && x"#),
            vec![r#"echo "a;b""#, "x"]
        );
    }

    #[test]
    fn derive_candidates_segments_compound_commands() {
        let candidates = derive_command_candidates("cd foo;gitleaks detect --source .");
        assert!(
            candidates.iter().any(|c| c == "gitleaks detect --source ."),
            "expected a gitleaks segment candidate, got: {candidates:?}"
        );
    }

    #[test]
    fn find_filter_matches_command_after_cd_and_pipe() {
        let filters = [blank_filter("^gitleaks\\b")];
        assert!(find_filter("cd repo;gitleaks detect", &filters).is_some());
        assert!(find_filter("npm i && gitleaks detect", &filters).is_some());
        assert!(find_filter("cat x | gitleaks detect", &filters).is_some());
        assert!(find_filter("echo gitleaks", &filters).is_none());
    }

    #[test]
    fn apply_filter_match_output_unless_guards_errors() {
        let f = FilterDef {
            match_output: vec![MatchOutput {
                pattern: "total size is".to_string(),
                message: "ok (synced)".to_string(),
                unless: Some("error|failed".to_string()),
            }],
            ..blank_filter(".*")
        };
        assert_eq!(apply_filter("total size is 100\n", &f), "ok (synced)");
        let out = apply_filter("rsync error\ntotal size is 100\n", &f);
        assert!(out.contains("error"), "error must not be masked: {out:?}");
    }

    /// One embedded golden case: `input` run through the filter must equal
    /// `expected` (ignoring trailing whitespace).
    #[derive(Debug, Deserialize)]
    struct GoldenCase {
        #[serde(default)]
        name: Option<String>,
        input: String,
        expected: String,
    }

    /// A bundled filter file viewed together with its `[[tests.<name>]]`
    /// golden cases.
    #[derive(Debug, Deserialize)]
    struct FilterTestFile {
        #[serde(default)]
        filters: HashMap<String, FilterDef>,
        #[serde(default)]
        tests: HashMap<String, Vec<GoldenCase>>,
    }

    #[test]
    fn bundled_filters_pass_embedded_golden_tests() {
        let mut total = 0usize;
        let mut files_with_tests = 0usize;
        let mut failures: Vec<String> = Vec::new();
        for asset in BundledFilters::iter() {
            let file = BundledFilters::get(&asset).expect("bundled asset readable");
            let content = std::str::from_utf8(file.data.as_ref()).expect("filter is utf8");
            let parsed: FilterTestFile = match toml::from_str(content) {
                Ok(p) => p,
                Err(e) => {
                    failures.push(format!("{asset}: TOML parse error: {e}"));
                    continue;
                }
            };
            if !parsed.tests.is_empty() {
                files_with_tests += 1;
            }
            for (fname, cases) in &parsed.tests {
                let Some(fdef) = parsed.filters.get(fname) else {
                    failures.push(format!(
                        "{asset}: [[tests.{fname}]] references undefined [filters.{fname}]"
                    ));
                    continue;
                };
                for (i, case) in cases.iter().enumerate() {
                    total += 1;
                    let got = apply_filter(&case.input, fdef);
                    if got.trim_end() != case.expected.trim_end() {
                        let label = case.name.clone().unwrap_or_else(|| format!("#{i}"));
                        failures.push(format!(
                            "{asset} [{fname} / {label}]\n expected: {:?}\n got: {:?}",
                            case.expected, got
                        ));
                    }
                }
            }
        }
        eprintln!(
            "golden: ran {total} embedded cases across {files_with_tests} bundled filter files"
        );
        assert!(
            failures.is_empty(),
            "{} bundled golden filter case(s) failed:\n\n{}",
            failures.len(),
            failures.join("\n\n")
        );
    }
}