use anyhow::{anyhow, Result};
use std::collections::{BTreeSet, HashMap, HashSet};
use std::path::Path;
use crate::chunker::count_tokens;
use crate::embed::embed_query;
use crate::store::{
fetch_chunks_by_ids, hybrid_search, open_db, search_fts, search_graph_nodes, SearchResult,
};
/// Searches the index for `query_text` and returns the best chunks that fit `budget`.
///
/// Returns `Ok(None)` when `open_db` yields no connection for `repo_root`.
/// Otherwise the candidate pool is the hybrid search result (at least 50
/// candidates, or `5 * k` if larger) plus symbol/path recall candidates,
/// reranked against the query. Only the first `k` reranked candidates are
/// considered; each is kept when its token cost still fits in the remaining
/// budget, and candidates that do not fit are skipped rather than ending the walk.
pub fn query_index(
    repo_root: &Path,
    query_text: &str,
    budget: usize,
    k: usize,
    file_filter: Option<&str>,
) -> Result<Option<Vec<SearchResult>>> {
    let Some(conn) = open_db(repo_root, false)? else {
        return Ok(None);
    };

    let query_vector = embed_query(query_text)?;
    // Over-fetch so the reranker has room to promote lexically strong matches.
    let candidate_k = k.saturating_mul(5).max(50);
    let mut candidates =
        hybrid_search(&conn, &query_vector, query_text, candidate_k, file_filter)?;
    add_symbol_recall_candidates(&conn, &mut candidates, query_text, file_filter)?;
    rerank_results(&mut candidates, query_text);

    let mut spent = 0usize;
    let selected: Vec<SearchResult> = candidates
        .into_iter()
        .take(k)
        .filter(|candidate| {
            let cost = crate::embed::count_tokens_accurate(&candidate.content);
            if spent + cost > budget {
                return false;
            }
            spent += cost;
            true
        })
        .collect();

    Ok(Some(selected))
}
/// Appends extra recall candidates to `results` that the hybrid search may have missed.
///
/// Sources, in order:
/// 1. graph nodes matching each of the first 10 query terms (8 nodes per term),
/// 2. chunks whose path matches a path-recall term (`add_path_recall_candidates`),
/// 3. for "token savings from hook log" style queries, chunks of `src/gain.rs`
///    and `src/store.rs` plus a fixed full-text search.
///
/// Chunks already present in `results` are never added twice. Every added
/// chunk gets a fixed `distance` of `0.45`. `file_filter`, when set, is a
/// substring that a chunk path must contain; it is honoured by every source,
/// including the hard-coded path lookup in step 3.
///
/// # Errors
/// Propagates failures from the store helpers and from preparing/running SQL.
fn add_symbol_recall_candidates(
    conn: &rusqlite::Connection,
    results: &mut Vec<SearchResult>,
    query_text: &str,
    file_filter: Option<&str>,
) -> Result<()> {
    let terms = query_terms(query_text);
    let mut seen: HashSet<i64> = results.iter().map(|result| result.id).collect();
    let mut ids = Vec::new();

    for term in terms.iter().take(10) {
        for node in search_graph_nodes(conn, term, 8)? {
            if file_filter.is_some_and(|filter| !node.path.contains(filter)) {
                continue;
            }
            if seen.insert(node.chunk_id) {
                ids.push(node.chunk_id);
            }
        }
    }

    add_path_recall_candidates(conn, &mut seen, &mut ids, query_text, file_filter)?;

    let asks_token_savings =
        terms.iter().any(|t| t == "token") && terms.iter().any(|t| t.starts_with("sav"));
    let asks_hook_log = terms.iter().any(|t| t == "hook") && terms.iter().any(|t| t == "log");
    if asks_token_savings && asks_hook_log {
        // The filter is applied inside SQL (before LIMIT) so that a filtered
        // query neither leaks chunks from other files nor loses its matches to
        // the row limit. A NULL parameter (no filter) keeps every row.
        let mut stmt = conn.prepare(
            "SELECT id FROM chunks
             WHERE path IN ('src/gain.rs', 'src/store.rs')
               AND (?1 IS NULL OR instr(path, ?1) > 0)
             ORDER BY CASE path WHEN 'src/gain.rs' THEN 0 ELSE 1 END, start_line
             LIMIT 12",
        )?;
        let rows = stmt.query_map([file_filter], |row| row.get::<_, i64>(0))?;
        for id in rows.flatten() {
            if seen.insert(id) {
                ids.push(id);
            }
        }
        for id in search_fts(
            conn,
            "compute_gain read_hook_log tokens_saved original_estimate",
            12,
            file_filter,
        )? {
            if seen.insert(id) {
                ids.push(id);
            }
        }
    }

    for mut chunk in fetch_chunks_by_ids(conn, &ids)? {
        // Recall candidates carry no measured distance; give them a fixed one
        // and let the lexical reranker decide where they land.
        chunk.distance = 0.45;
        results.push(chunk);
    }
    Ok(())
}
/// Adds chunk ids whose path contains one of the query's path-recall terms.
///
/// Up to 8 qualifying terms are used (see `is_path_recall_term`); for each,
/// the 8 smallest chunks (by `token_count`, then `start_line`) whose
/// lower-cased path contains the term are considered. Ids already in `seen`
/// are skipped, as are chunks whose path does not contain `file_filter` when a
/// filter is given. Accepted ids are recorded in both `seen` and `ids`.
///
/// The statement is prepared once and returns the path together with the id,
/// so no per-row follow-up query is needed to apply the filter.
///
/// # Errors
/// Propagates failures from preparing or running the SQL statement.
fn add_path_recall_candidates(
    conn: &rusqlite::Connection,
    seen: &mut HashSet<i64>,
    ids: &mut Vec<i64>,
    query_text: &str,
    file_filter: Option<&str>,
) -> Result<()> {
    let path_terms: Vec<String> = query_terms(query_text)
        .into_iter()
        .filter(|term| is_path_recall_term(term))
        .take(8)
        .collect();
    if path_terms.is_empty() {
        // Nothing to look up: do not touch the database at all.
        return Ok(());
    }

    let mut stmt = conn.prepare(
        "SELECT id, path FROM chunks
         WHERE lower(path) LIKE ?1
         ORDER BY token_count, start_line
         LIMIT 8",
    )?;

    for term in path_terms {
        let pattern = format!("%{}%", term);
        let rows = stmt.query_map([pattern], |row| {
            Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?))
        })?;
        for (id, path) in rows.flatten() {
            if seen.contains(&id) {
                continue;
            }
            if file_filter.is_some_and(|filter| !path.contains(filter)) {
                continue;
            }
            seen.insert(id);
            ids.push(id);
        }
    }
    Ok(())
}
/// Reports whether a query term should trigger a path-based lookup.
///
/// A term qualifies when it is one of a fixed set of keywords or when it
/// consists solely of ASCII digits.
fn is_path_recall_term(term: &str) -> bool {
    const PATH_KEYWORDS: &[&str] = &[
        "pwa",
        "coerencia",
        "coherence",
        "composer",
        "pipeline",
        "manifest",
        "capacitor",
        "chapter",
        "capitulo",
        "chunk",
        "chunker",
    ];

    if PATH_KEYWORDS.contains(&term) {
        return true;
    }
    // Any non-ASCII character encodes to bytes >= 0x80, none of which are digits.
    term.bytes().all(|byte| byte.is_ascii_digit())
}
/// Reorders `results` in place so the highest `hybrid_score` comes first.
///
/// Does nothing when the query yields no usable terms. The sort is stable, so
/// results with equal scores keep their incoming relative order.
///
/// Each result is scored exactly once (scoring normalises the whole chunk
/// content, which is too expensive to repeat per comparison), and scores are
/// compared with `f32::total_cmp` so the ordering stays a valid total order
/// even if a score turns out to be NaN.
pub fn rerank_results(results: &mut [SearchResult], query: &str) {
    /// Sort key that ranks larger scores first.
    struct Descending(f32);

    impl PartialEq for Descending {
        fn eq(&self, other: &Self) -> bool {
            self.cmp(other) == std::cmp::Ordering::Equal
        }
    }
    impl Eq for Descending {}
    impl PartialOrd for Descending {
        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
            Some(self.cmp(other))
        }
    }
    impl Ord for Descending {
        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
            // Operands are swapped on purpose: higher score sorts earlier.
            other.0.total_cmp(&self.0)
        }
    }

    let terms = query_terms(query);
    if terms.is_empty() {
        return;
    }
    results.sort_by_cached_key(|result| Descending(hybrid_score(result, &terms)));
}
/// Combined ranking score: semantic closeness (`1.0 - distance`) plus the
/// lexical boost computed for the query terms.
fn hybrid_score(result: &SearchResult, terms: &[String]) -> f32 {
    (1.0 - result.distance) + lexical_boost(result, terms)
}
/// Computes the lexical part of a result's score for the given query terms.
///
/// Path, symbol and content are normalised with `normalize_text` first. Per
/// term the boost grows by 0.55 when the term equals the file stem, 0.28 when
/// the path contains it, 0.24 when the symbol contains it and 0.045 when the
/// content contains it. A coverage bonus of 0.08 per matched term (capped at
/// 0.5) is added, followed by the intent/domain/project/language boosts and
/// the leak/doc/asset penalties. The final value is capped at 2.5.
fn lexical_boost(result: &SearchResult, terms: &[String]) -> f32 {
    let path = normalize_text(&result.path);
    let symbol = normalize_text(&result.symbol);
    let content = normalize_text(&result.content);

    // File name without directories, cut at its first '.'.
    let file_name = result.path.rsplit('/').next();
    let path_stem = match file_name.and_then(|name| name.split('.').next()) {
        Some(stem) => normalize_text(stem),
        None => String::new(),
    };

    let mut boost = 0.0f32;
    let mut matched_terms = 0usize;
    for term in terms {
        let needle = term.as_str();
        let signals = [
            (path_stem == needle, 0.55f32),
            (path.contains(needle), 0.28),
            (!symbol.is_empty() && symbol.contains(needle), 0.24),
            (content.contains(needle), 0.045),
        ];

        let mut matched = false;
        for (hit, weight) in signals {
            if hit {
                boost += weight;
                matched = true;
            }
        }
        if matched {
            matched_terms += 1;
        }
    }
    boost += (matched_terms as f32 * 0.08).min(0.5);

    // Applied one by one, in this order, to the same accumulator.
    let adjustments = [
        intent_boost(&path, &symbol, &content, terms),
        domain_boost(&path, &symbol, &content, terms),
        project_intent_boost(&path, terms),
        language_boost(&path, terms),
        benchmark_leak_penalty(&path, terms),
        test_leak_penalty(&symbol, &content, terms),
        markdown_doc_penalty(&path, terms),
        non_code_asset_penalty(&path, terms),
    ];
    for adjustment in adjustments {
        boost += adjustment;
    }

    boost.min(2.5)
}
/// Boosts results that look like the implementation behind a recognised query intent.
///
/// Four intents are recognised from `terms`, each adding a fixed amount when
/// the result shows matching evidence:
/// - hook fail-open behaviour (+0.75),
/// - hook behaviour on a stale/missing index (+0.55),
/// - token savings computed from the hook log (+1.5 / +1.1 / +0.45 depending
///   on how direct the evidence is),
/// - compression of cargo/error output (+1.2).
///
/// NOTE(review): the in-file caller passes text produced by `normalize_text`,
/// which turns `_` into a space, so needles containing an underscore (e.g.
/// `run_hook`, `read_hook_log`) can only match un-normalised input. The
/// existing ranking tests are tuned to this behaviour - confirm before
/// changing the needles.
fn intent_boost(path: &str, symbol: &str, content: &str, terms: &[String]) -> f32 {
    let has_term = |word: &str| terms.iter().any(|t| t == word);
    let has_term_prefix = |prefix: &str| terms.iter().any(|t| t.starts_with(prefix));

    let mut total = 0.0;
    let hook_related = has_term("hook") || path.contains("hook");

    // Intent: how the hook fails open.
    let asks_fail_open = has_term("fail") && has_term("open");
    let fail_open_evidence = ["exit 0", "process exit 0", "pass through", "action pass"]
        .iter()
        .any(|needle| content.contains(*needle))
        || symbol.contains("run_hook");
    if hook_related && asks_fail_open && fail_open_evidence {
        total += 0.75;
    }

    // Intent: hook behaviour when the index is stale or missing.
    let asks_stale_index = has_term("stale") && (has_term("index") || has_term("missing"));
    let stale_evidence = ["index_staleness", "staleness stale", "max_index_age_secs"]
        .iter()
        .any(|needle| content.contains(*needle));
    if hook_related && asks_stale_index && stale_evidence {
        total += 0.55;
    }

    // Intent: token savings derived from the hook log.
    let asks_token_savings = has_term("token") && has_term_prefix("sav");
    let asks_hook_log = has_term("hook") && has_term("log");
    let gain_site = path.contains("gain") || symbol.contains("compute_gain");
    let reads_hook_log = content.contains("read_hook_log");
    let savings_evidence = gain_site
        || reads_hook_log
        || content.contains("tokens_saved")
        || content.contains("original_estimate");
    if asks_token_savings && asks_hook_log && savings_evidence {
        total += if gain_site {
            1.5
        } else if reads_hook_log {
            1.1
        } else {
            0.45
        };
    }

    // Intent: compression of cargo / error output.
    let asks_output_compression = has_term("output")
        && has_term_prefix("compress")
        && (has_term("cargo") || has_term_prefix("error"));
    let compression_evidence = path.contains("compress")
        || symbol.contains("compress_cargo")
        || symbol.contains("compress_bash_output")
        || content.contains("compress cargo")
        || content.contains("cargo output");
    if asks_output_compression && compression_evidence {
        total += 1.2;
    }

    total
}
/// Boosts results that belong to the subsystem a query is about.
///
/// Recognised topics and their increments:
/// - chunking / outlining of code (+1.4),
/// - database access (+0.18),
/// - vector / cosine similarity search (+1.0).
///
/// NOTE(review): the in-file caller passes `normalize_text` output, in which
/// `_` has become a space, so underscore needles such as `chunk_file` only
/// match un-normalised input - confirm before relying on them.
fn domain_boost(path: &str, symbol: &str, content: &str, terms: &[String]) -> f32 {
    let mut total = 0.0;

    // Topic: chunking and outlines.
    let asks_chunking = has_any(terms, &["chunk", "chunker", "symbol", "outline", "outlin"])
        && has_any(terms, &["rust", "file", "files", "code", "agent"]);
    let chunking_evidence = path.contains("chunker")
        || ["chunk_file", "chunk_rust", "generate_outline"]
            .iter()
            .any(|needle| symbol.contains(*needle))
        || ["chunk_file", "generate_outline", "symbol aware"]
            .iter()
            .any(|needle| content.contains(*needle));
    if asks_chunking && chunking_evidence {
        total += 1.4;
    }

    // Topic: database access.
    let asks_database = has_any(
        terms,
        &["postgres", "postgresql", "sqlite", "sql", "transaction", "pool"],
    );
    let database_evidence = path.contains("database")
        || path.contains("db")
        || symbol.contains("pool")
        || symbol.contains("transaction")
        || content.contains("postgres")
        || content.contains("from pg");
    if asks_database && database_evidence {
        total += 0.18;
    }

    // Topic: vector similarity search.
    let asks_vector_similarity = has_any(terms, &["cosine", "similarity", "vector"])
        && has_any(terms, &["sqlite", "search", "implemented"]);
    let similarity_evidence = path.contains("store")
        || symbol.contains("cosine_similarity")
        || content.contains("cosine similarity")
        || content.contains("cosine_similarity_to_bytes");
    if asks_vector_similarity && similarity_evidence {
        total += 1.0;
    }

    total
}
/// Adjusts the score based on which area of the project a query targets.
///
/// `path` is matched with space-separated fragments such as `"tests "` or
/// `" md"`. Four query families are recognised from `terms` - tests/PWA,
/// generation pipeline, analysis reports and narrative content - and each
/// rewards paths in its own area while demoting paths it commonly gets
/// confused with. The return value may be negative.
fn project_intent_boost(path: &str, terms: &[String]) -> f32 {
    const TEST_TERMS: &[&str] = &[
        "test",
        "tests",
        "pytest",
        "spec",
        "pwa",
        "capacitor",
        "manifest",
        "mobile",
    ];
    const PIPELINE_TERMS: &[&str] = &[
        "pipeline",
        "generator",
        "generation",
        "geracao",
        "imagem",
        "image",
        "video",
        "composer",
        "client",
    ];
    const REPORT_TERMS: &[&str] = &[
        "analise",
        "analysis",
        "coerencia",
        "coherence",
        "relatorio",
        "report",
        "recomendacoes",
        "recommendations",
        "auditoria",
        "audit",
    ];
    const NARRATIVE_TERMS: &[&str] = &[
        "capitulo",
        "chapter",
        "personagem",
        "personagens",
        "narrativa",
        "cronologia",
        "gabo",
        "valeria",
        "aria",
        "rangel",
    ];

    let mut total = 0.0;

    if has_any(terms, TEST_TERMS) {
        if path.starts_with("tests ") {
            total += 0.55;
        }
        if path.contains("test pwa") {
            total += 2.2;
        }
        // A PWA-specific query should not be answered by the link checker.
        if has_any(terms, &["pwa", "capacitor", "manifest"]) && path.contains("links") {
            total -= 1.1;
        }
    }

    let asks_pipeline = has_any(terms, PIPELINE_TERMS);
    if asks_pipeline {
        if path.starts_with("scripts ") {
            total += 0.5;
        }
        if path.contains("art gen") || path.contains("video gen") {
            total += 0.45;
        }
        let wants_narrative_docs = has_any(terms, &["capitulo", "chapter", "narrativa"]);
        if path.ends_with(" md") && !wants_narrative_docs {
            total -= 0.75;
        }
    }

    if has_any(terms, REPORT_TERMS) {
        if path.starts_with("docs ") {
            total += 0.45;
        }
        if path.contains("analise coerencia") {
            total += 1.0;
        }
    }

    if has_any(terms, NARRATIVE_TERMS) {
        if path.starts_with("docs public ") || path.starts_with("cronologia ") {
            total += 0.65;
        }
        // Scripts are only relevant to narrative queries that also ask about the pipeline.
        if !asks_pipeline && path.starts_with("scripts ") {
            total -= 0.45;
        }
    }

    total
}
/// Returns `true` when at least one entry of `terms` equals one of `needles`.
fn has_any(terms: &[String], needles: &[&str]) -> bool {
    for term in terms {
        if needles.contains(&term.as_str()) {
            return true;
        }
    }
    false
}
/// Returns `-0.8` for the path `"src benchmark rs"` unless the query mentions
/// `benchmark`, `bench`, `evaluation` or `test`; otherwise `0.0`.
fn benchmark_leak_penalty(path: &str, terms: &[String]) -> f32 {
    if path != "src benchmark rs" {
        return 0.0;
    }
    let asks_benchmark = terms
        .iter()
        .any(|t| ["benchmark", "bench", "evaluation", "test"].contains(&t.as_str()));
    if asks_benchmark {
        0.0
    } else {
        -0.8
    }
}
/// Returns `-1.5` for results that look like test code when the query did not
/// ask about tests or benchmarks; otherwise `0.0`.
///
/// A result looks like test code when its symbol starts with `"test "` or
/// `"rerank "`, contains `" test"`, or its content contains `"assert "` or
/// `"assert eq"`.
fn test_leak_penalty(symbol: &str, content: &str, terms: &[String]) -> f32 {
    let asks_test = terms
        .iter()
        .any(|t| ["test", "tests", "benchmark", "bench"].contains(&t.as_str()));
    if asks_test {
        return 0.0;
    }

    let symbol_looks_like_test =
        symbol.starts_with("test ") || symbol.contains(" test") || symbol.starts_with("rerank ");
    let content_looks_like_test = content.contains("assert ") || content.contains("assert eq");

    if symbol_looks_like_test || content_looks_like_test {
        -1.5
    } else {
        0.0
    }
}
/// Returns `-0.9` for Markdown paths when the query is a code task that does
/// not ask for documentation; otherwise `0.0`.
///
/// A path counts as Markdown when it ends with `" md"` or contains `"agents md"`.
fn markdown_doc_penalty(path: &str, terms: &[String]) -> f32 {
    const DOC_TERMS: &[&str] = &[
        "doc",
        "docs",
        "readme",
        "agent",
        "agents",
        "instruction",
        "instructions",
    ];
    const CODE_TERMS: &[&str] = &[
        "code",
        "function",
        "symbol",
        "chunk",
        "index",
        "hook",
        "rust",
        "typescript",
        "python",
        "method",
        "class",
    ];

    let mentions = |vocabulary: &[&str]| terms.iter().any(|t| vocabulary.contains(&t.as_str()));
    let is_markdown = path.ends_with(" md") || path.contains("agents md");

    if mentions(CODE_TERMS) && !mentions(DOC_TERMS) && is_markdown {
        -0.9
    } else {
        0.0
    }
}
/// Adds `0.35` for each language named in the query whose file suffix
/// (`" rs"`, `" ts"`, `" py"`) ends `path`.
fn language_boost(path: &str, terms: &[String]) -> f32 {
    const LANGUAGE_SUFFIXES: [(&str, &str); 3] =
        [("rust", " rs"), ("typescript", " ts"), ("python", " py")];

    let mut total = 0.0;
    for (language, suffix) in LANGUAGE_SUFFIXES {
        if terms.iter().any(|t| t == language) && path.ends_with(suffix) {
            total += 0.35;
        }
    }
    total
}
/// Returns `-1.0` for asset/config paths when the query is a code task that
/// does not ask about filters or configuration; otherwise `0.0`.
///
/// A path counts as an asset when it starts with `"assets "` or ends with
/// `" toml"` or `" yaml"`.
fn non_code_asset_penalty(path: &str, terms: &[String]) -> f32 {
    const CONFIG_TERMS: &[&str] = &[
        "filter",
        "filters",
        "config",
        "configuration",
        "toml",
        "yaml",
        "json",
    ];
    const CODE_TERMS: &[&str] = &[
        "code",
        "function",
        "symbol",
        "chunk",
        "index",
        "hook",
        "rust",
        "typescript",
        "python",
        "method",
        "class",
    ];

    let mentions = |vocabulary: &[&str]| terms.iter().any(|t| vocabulary.contains(&t.as_str()));
    let is_asset =
        path.starts_with("assets ") || path.ends_with(" toml") || path.ends_with(" yaml");

    if mentions(CODE_TERMS) && !mentions(CONFIG_TERMS) && is_asset {
        -1.0
    } else {
        0.0
    }
}
/// Pushes crude stems of `term` onto `out`.
///
/// For each of the suffixes `ing`, `ed`, `es`, `s` (in that order) that
/// `term` ends with, the term minus the suffix is pushed, provided more than
/// two bytes remain. Several stems may be pushed for one term.
fn add_stems(term: &str, out: &mut Vec<String>) {
    const SUFFIXES: [&str; 4] = ["ing", "ed", "es", "s"];

    let stems = SUFFIXES
        .iter()
        .filter_map(|suffix| term.strip_suffix(*suffix))
        .filter(|stem| stem.len() > 2)
        .map(str::to_string);
    out.extend(stems);
}
/// Splits a query into the sorted, de-duplicated terms used for lexical matching.
///
/// The query is normalised with `normalize_text`, split on whitespace, and
/// words shorter than three bytes or listed in `STOP_WORDS` are dropped. Stems
/// of the surviving words are added via `add_stems`. When the resulting terms
/// include `missing`, the words `not` and `found` are added as well.
fn query_terms(query: &str) -> Vec<String> {
    let normalized = normalize_text(query);
    let words: Vec<String> = normalized
        .split_whitespace()
        .filter(|word| word.len() >= 3 && !STOP_WORDS.contains(word))
        .map(str::to_string)
        .collect();

    let mut terms = words.clone();
    for word in &words {
        add_stems(word, &mut terms);
    }

    // "missing X" is commonly phrased "X not found" in code and messages.
    if terms.iter().any(|t| t == "missing") {
        terms.extend(["not".to_string(), "found".to_string()]);
    }

    terms.sort();
    terms.dedup();
    terms
}
/// Lower-cases ASCII letters, folds common Latin-1 accented letters to their
/// base ASCII letter, and replaces every other character with a space.
///
/// The output has exactly one character per input character.
fn normalize_text(s: &str) -> String {
    /// Maps a single character to its normalised form.
    fn fold(c: char) -> char {
        if c.is_ascii_alphanumeric() {
            return c.to_ascii_lowercase();
        }
        match u32::from(c) {
            0xC0..=0xC5 | 0xE0..=0xE5 => 'a',
            0xC7 | 0xE7 => 'c',
            0xC8..=0xCB | 0xE8..=0xEB => 'e',
            0xCC..=0xCF | 0xEC..=0xEF => 'i',
            0xD1 | 0xF1 => 'n',
            0xD2..=0xD6 | 0xD8 | 0xF2..=0xF6 | 0xF8 => 'o',
            0xD9..=0xDC | 0xF9..=0xFC => 'u',
            0xDD | 0xFD | 0xFF => 'y',
            _ => ' ',
        }
    }

    let mut normalized = String::with_capacity(s.len());
    for c in s.chars() {
        normalized.push(fold(c));
    }
    normalized
}
/// Words dropped from a query by `query_terms` before lexical matching.
/// Entries are lower-case and at least three letters long, matching the form
/// of the normalised words they are compared against.
const STOP_WORDS: &[&str] = &[
"the", "and", "for", "with", "how", "does", "are", "from", "when", "what", "where", "into",
"this", "that", "was", "were", "has", "have", "can", "should",
];
/// Renders `results` for `query` as text grouped by file, without truncating
/// any chunk (delegates to `format_results_with_limit` with no per-chunk limit).
pub fn format_results(results: &[SearchResult], query: &str) -> String {
format_results_with_limit(results, query, None)
}
fn format_results_with_limit(
results: &[SearchResult],
query: &str,
max_chunk_tokens: Option<usize>,
) -> String {
if results.is_empty() {
return format!("No relevant context found for: {}", query);
}
let mut parts = vec![format!(
"<!-- tokenix: {} chunks for '{}' -->",
results.len(),
query
)];
let mut by_file: HashMap<&str, Vec<&SearchResult>> = HashMap::new();
for r in results {
by_file.entry(&r.path).or_default().push(r);
}
let mut files: Vec<(&str, Vec<&SearchResult>)> = by_file.into_iter().collect();
files.sort_by_key(|(p, _)| *p);
for (path, mut chunks) in files {
chunks.sort_by_key(|c| c.start_line);
parts.push(format!("\n### {}", path));
for c in chunks {
let label = if c.symbol.is_empty() {
format!("L{}-{}", c.start_line, c.end_line)
} else {
format!("L{}-{} [{}] {}", c.start_line, c.end_line, c.kind, c.symbol)
};
parts.push(format!("``` {}", label));
parts.push(compact_content(&c.content, max_chunk_tokens));
parts.push("```".to_string());
}
}
let total_tokens: usize = results.iter().map(|r| r.token_count).sum();
parts.push(format!("\n<!-- {} tokens -->", total_tokens));
parts.join("\n")
}
/// Truncates `content` to roughly `max_tokens` tokens on line boundaries.
///
/// With no limit, or when the content already fits, it is returned unchanged.
/// Otherwise leading lines are kept until the next line would exceed the
/// limit, and a trailing comment states how many lines were omitted. The
/// trailer itself is not counted against the limit.
fn compact_content(content: &str, max_tokens: Option<usize>) -> String {
    let limit = match max_tokens {
        Some(limit) if count_tokens(content) > limit => limit,
        _ => return content.to_string(),
    };

    let mut spent = 0usize;
    let kept: Vec<&str> = content
        .lines()
        .take_while(|line| {
            let cost = count_tokens(line);
            if spent + cost > limit {
                false
            } else {
                spent += cost;
                true
            }
        })
        .collect();

    let omitted = content.lines().count().saturating_sub(kept.len());
    format!(
        "{}\n// ... {omitted} line(s) omitted by tokenix budget",
        kept.join("\n")
    )
}
/// Reads `file_path` and returns the outline generated for its contents.
///
/// The path handed to the outline generator uses `/` separators. Returns
/// `None` when the file cannot be read as text.
pub fn get_file_outline(file_path: &Path) -> Option<String> {
    match std::fs::read_to_string(file_path) {
        Ok(text) => {
            let display_path = file_path.to_string_lossy().replace('\\', "/");
            Some(crate::chunker::generate_outline(&text, &display_path))
        }
        Err(_) => None,
    }
}
pub fn build_task_context(
repo_root: &Path,
task: &str,
budget: usize,
max_files: usize,
) -> Result<String> {
let search_budget = (budget * 2 / 3).max(500);
let result_count = if budget <= 1500 { 5 } else { 12 };
let results = query_index(repo_root, task, search_budget, result_count, None)?
.ok_or_else(|| anyhow!("Index not found. Please index the workspace first."))?;
if results.is_empty() {
return Ok(format!("No relevant context found for: {}", task));
}
let mut out = String::new();
out.push_str(&format!("<!-- tokenix_context: '{}' -->\n\n", task));
append_preferences(&mut out, repo_root, task, budget);
out.push_str("## Entry Points\n");
for result in results.iter().take(8) {
let symbol = if result.symbol.is_empty() {
"(file chunk)"
} else {
result.symbol.as_str()
};
out.push_str(&format!(
"- {}:{}-{} [{}] {}\n",
result.path, result.start_line, result.end_line, result.kind, symbol
));
}
out.push_str("\n## Relevant Source\n");
let max_chunk_tokens = if budget <= 1500 { Some(110) } else { None };
out.push_str(&format_results_with_limit(&results, task, max_chunk_tokens));
let mut paths = BTreeSet::new();
for result in &results {
paths.insert(result.path.clone());
if paths.len() >= max_files.max(1) {
break;
}
}
if budget <= 1500 {
return Ok(out);
}
out.push_str("\n\n## File Outlines\n");
let mut outline_tokens = crate::chunker::count_tokens(&out);
for path in paths {
if outline_tokens >= budget {
break;
}
let full = repo_root.join(&path);
if !full.exists() {
continue;
}
if let Some(outline) = get_file_outline(&full) {
let remaining = budget.saturating_sub(outline_tokens);
let outline_cost = crate::chunker::count_tokens(&outline);
if outline_cost > remaining {
out.push_str(&format!(
"\n### {}\n(outline omitted: budget exhausted)\n",
path
));
break;
}
out.push_str(&format!("\n### {}\n{}\n", path, outline));
outline_tokens += outline_cost;
}
}
Ok(out)
}
/// Splits a context document into its `## ` sections and counts tokens per section.
///
/// Text before the first heading is reported as `(preamble)`. Each section's
/// count includes its own heading line. Sections whose text is blank are
/// omitted. Sections are returned in document order.
pub fn budget_breakdown(context: &str) -> Vec<(String, usize)> {
    let mut sections: Vec<(String, String)> = vec![("(preamble)".to_string(), String::new())];

    for line in context.lines() {
        if let Some(title) = line.strip_prefix("## ") {
            sections.push((title.trim().to_string(), String::new()));
        }
        let (_, body) = sections
            .last_mut()
            .expect("sections always holds at least the preamble entry");
        body.push_str(line);
        body.push('\n');
    }

    let mut breakdown = Vec::with_capacity(sections.len());
    for (title, body) in sections {
        if body.trim().is_empty() {
            continue;
        }
        breakdown.push((title, count_tokens(&body)));
    }
    breakdown
}
pub fn build_explore_context(
repo_root: &Path,
task: &str,
budget: usize,
max_symbols: usize,
) -> Result<String> {
let conn = open_db(repo_root, false)?
.ok_or_else(|| anyhow!("Index not found. Please index the workspace first."))?;
let seed_budget = (budget / 2).max(500);
let seeds = query_index(repo_root, task, seed_budget, max_symbols.max(4), None)?
.ok_or_else(|| anyhow!("Index not found. Please index the workspace first."))?;
if seeds.is_empty() {
return Ok(format!("No relevant context found for: {}", task));
}
let mut chunk_ids = Vec::new();
let mut seen_ids = HashSet::new();
let mut relation_lines = BTreeSet::new();
for seed in &seeds {
if seen_ids.insert(seed.id) {
chunk_ids.push(seed.id);
}
if seed.symbol.is_empty() {
continue;
}
for relation in crate::store::graph_callers(&conn, &seed.symbol, 4)?
.into_iter()
.chain(crate::store::graph_callees(&conn, &seed.symbol, 4)?)
{
relation_lines.insert(format!(
"- {}:{} [{}] {} -> {}:{} [{}] {} via `{}`",
relation.from.path,
relation.from.start_line,
relation.from.kind,
relation.from.name,
relation.to.path,
relation.to.start_line,
relation.to.kind,
relation.to.name,
relation.reference
));
for id in [relation.from.chunk_id, relation.to.chunk_id] {
if seen_ids.insert(id) && chunk_ids.len() < max_symbols.max(4) * 3 {
chunk_ids.push(id);
}
}
}
}
let chunks = crate::store::fetch_chunks_by_ids(&conn, &chunk_ids)?;
let mut out = String::new();
out.push_str(&format!("<!-- tokenix_explore: '{}' -->\n\n", task));
append_preferences(&mut out, repo_root, task, budget);
out.push_str("## Entry Points\n");
for seed in seeds.iter().take(max_symbols.max(1)) {
let symbol = if seed.symbol.is_empty() {
"(file chunk)"
} else {
seed.symbol.as_str()
};
out.push_str(&format!(
"- {}:{}-{} [{}] {}\n",
seed.path, seed.start_line, seed.end_line, seed.kind, symbol
));
}
out.push_str("\n## Relationship Map\n");
if relation_lines.is_empty() {
out.push_str("(no graph relationships found for the selected entry points)\n");
} else {
for line in relation_lines.iter().take(max_symbols.max(4) * 3) {
out.push_str(line);
out.push('\n');
}
}
out.push_str("\n## Source By File\n");
append_grouped_chunks(&mut out, &chunks, budget);
Ok(out)
}
/// Appends the `## Preference Memory` section to `out`.
///
/// Budgets of 1500 or less get a one-line instruction; larger budgets get the
/// full three-line guidance. Saved preferences for `task`, when any exist,
/// follow the instructions. A failure to load preferences is treated as
/// "no preferences".
fn append_preferences(out: &mut String, repo_root: &Path, task: &str, budget: usize) {
    let preferences =
        crate::memory::preferences_for_context(repo_root, task, 8).unwrap_or_default();
    let compact = budget <= 1500;

    out.push_str("## Preference Memory\n");
    if compact {
        out.push_str("- Durable prefs: `tokenix_memory_add`; no secrets.\n");
    } else {
        out.push_str("- If the user states a durable preference, migration decision, workflow rule, or project policy, save it with `tokenix_memory_add` when MCP is available.\n");
        out.push_str("- Use `scope=project` for repository-specific rules and `scope=global` only for cross-repository preferences.\n");
        out.push_str(
            "- Do not save secrets, credentials, private tokens, one-off bug details, or guesses.\n",
        );
    }

    if preferences.is_empty() {
        out.push('\n');
        return;
    }

    // The full guidance is separated from the saved list by a blank line.
    let heading = if compact {
        "Saved preferences:\n"
    } else {
        "\nSaved preferences:\n"
    };
    out.push_str(heading);
    out.push_str(&preferences);
    out.push_str("\n\n");
}
/// Appends `chunks` to `out` grouped by file, stopping once `budget` is reached.
///
/// Files are visited in ascending path order and chunks inside a file by
/// `start_line`. Before each file, and before each chunk, the token count of
/// the whole output so far is checked: a file is not started once the budget
/// is reached, and the first chunk that would overflow it is replaced by an
/// omission notice that ends the output.
fn append_grouped_chunks(out: &mut String, chunks: &[SearchResult], budget: usize) {
    // A BTreeMap yields its keys in ascending order, giving path-sorted groups.
    let mut by_file: std::collections::BTreeMap<&str, Vec<&SearchResult>> =
        std::collections::BTreeMap::new();
    for chunk in chunks {
        by_file.entry(chunk.path.as_str()).or_default().push(chunk);
    }

    for (path, mut group) in by_file {
        if count_tokens(out) >= budget {
            break;
        }
        group.sort_by_key(|chunk| chunk.start_line);
        out.push_str(&format!("\n### {}\n", path));

        for chunk in group {
            let mut label = format!("L{}-{}", chunk.start_line, chunk.end_line);
            if !chunk.symbol.is_empty() {
                label.push_str(&format!(" [{}] {}", chunk.kind, chunk.symbol));
            }
            let block = format!("``` {}\n{}\n```\n", label, chunk.content);
            if count_tokens(out) + count_tokens(&block) > budget {
                out.push_str("(remaining source omitted: budget exhausted)\n");
                return;
            }
            out.push_str(&block);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::store::SearchResult;

    /// Query shared by the hook fail-open ranking tests.
    const HOOK_QUERY: &str = "how does hook fail open when index is stale or missing";
    /// Query shared by the chunking ranking tests.
    const CHUNKING_QUERY: &str = "how are rust files chunked into symbols and outlines";

    /// Builds a `fn` chunk with id 0 and distance 0.1.
    fn make_result(
        path: &str,
        start: usize,
        end: usize,
        symbol: &str,
        content: &str,
    ) -> SearchResult {
        SearchResult {
            id: 0,
            path: path.to_string(),
            start_line: start,
            end_line: end,
            symbol: symbol.to_string(),
            kind: "fn".to_string(),
            content: content.to_string(),
            token_count: crate::chunker::count_tokens(content),
            distance: 0.1,
        }
    }

    /// Same as `make_result`, with an explicit semantic distance.
    fn make_ranked(
        distance: f32,
        path: &str,
        start: usize,
        end: usize,
        symbol: &str,
        content: &str,
    ) -> SearchResult {
        SearchResult {
            distance,
            ..make_result(path, start, end, symbol, content)
        }
    }

    #[test]
    fn format_results_empty() {
        let rendered = format_results(&[], "test query");
        assert!(rendered.contains("No relevant context found"));
        assert!(rendered.contains("test query"));
    }

    #[test]
    fn budget_breakdown_splits_by_section() {
        let ctx = "<!-- tokenix_context: 'x' -->\n\n## Preference Memory\n- a saved pref line\n\n## Entry Points\n- foo:1-2 [fn] foo\n\n## Relevant Source\nfn foo() { do_work() }\n";
        let sections = budget_breakdown(ctx);
        let names: Vec<&str> = sections.iter().map(|(name, _)| name.as_str()).collect();
        for expected in ["Preference Memory", "Entry Points", "Relevant Source"] {
            assert!(names.contains(&expected));
        }
        assert!(!names.iter().any(|name| name.starts_with('#')));
        assert!(sections.iter().all(|(_, tokens)| *tokens > 0));
    }

    #[test]
    fn format_results_groups_by_file() {
        let results = vec![
            make_result("src/auth.rs", 10, 30, "login", "fn login() {}"),
            make_result("src/auth.rs", 50, 80, "logout", "fn logout() {}"),
            make_result("src/db.rs", 1, 20, "connect", "fn connect() {}"),
        ];
        let rendered = format_results(&results, "auth flow");
        for expected in [
            "### src/auth.rs",
            "### src/db.rs",
            "[fn] login",
            "[fn] logout",
            "[fn] connect",
        ] {
            assert!(rendered.contains(expected));
        }
    }

    #[test]
    fn format_results_header_includes_chunk_count() {
        let results = vec![make_result("src/x.rs", 1, 5, "foo", "fn foo() {}")];
        let rendered = format_results(&results, "foo function");
        assert!(rendered.contains("1 chunks"));
    }

    #[test]
    fn format_results_line_range_shown() {
        let results = vec![make_result("src/x.rs", 42, 60, "", "some code here")];
        let rendered = format_results(&results, "q");
        assert!(rendered.contains("L42-60"));
    }

    #[test]
    fn compact_content_truncates_large_blocks() {
        let content = "let value = compute();\n".repeat(80);
        let compact = compact_content(&content, Some(30));
        assert!(compact.contains("omitted by tokenix budget"));
        assert!(count_tokens(&compact) < count_tokens(&content));
    }

    #[test]
    fn rerank_prefers_hook_fail_open_implementation() {
        let mut results = vec![
            make_ranked(
                0.01,
                "src/store.rs",
                1,
                20,
                "index_staleness",
                "pub fn index_staleness() { stale missing index }",
            ),
            make_ranked(
                0.20,
                "src/hook.rs",
                327,
                407,
                "run_hook",
                "pub fn run_hook() { if staleness.stale { std::process::exit(0); } action pass }",
            ),
        ];
        rerank_results(&mut results, HOOK_QUERY);
        assert_eq!(results[0].path, "src/hook.rs");
        assert_eq!(results[0].symbol, "run_hook");
    }

    #[test]
    fn rerank_prefers_pre_hook_staleness_over_post_hook_compression() {
        let mut results = vec![
            make_ranked(
                0.01,
                "src/compress.rs",
                461,
                516,
                "run_hook_post",
                "pub fn run_hook_post() { std::process::exit(0); log_hook_event saved_tokens }",
            ),
            make_ranked(
                0.16,
                "src/hook.rs",
                327,
                407,
                "run_hook",
                "pub fn run_hook() { let staleness = index_staleness(); if staleness.stale { std::process::exit(0); } action pass }",
            ),
        ];
        rerank_results(&mut results, HOOK_QUERY);
        assert_eq!(results[0].path, "src/hook.rs");
        assert_eq!(results[0].symbol, "run_hook");
    }

    #[test]
    fn rerank_prefers_store_for_sqlite_cosine_search() {
        let mut results = vec![
            make_ranked(
                0.01,
                "src/embed.rs",
                152,
                169,
                "similar_texts_have_higher_cosine_similarity",
                "cosine similarity embedding query vector",
            ),
            make_ranked(
                0.16,
                "src/store.rs",
                523,
                551,
                "cosine_similarity_to_bytes",
                "sqlite embeddings chunk_id cosine_similarity_to_bytes vector search",
            ),
        ];
        rerank_results(
            &mut results,
            "how is cosine similarity search implemented in sqlite",
        );
        assert_eq!(results[0].path, "src/store.rs");
    }

    #[test]
    fn rerank_penalizes_unit_test_leakage_for_code_queries() {
        let mut results = vec![
            make_ranked(
                0.01,
                "src/query.rs",
                728,
                758,
                "rerank_prefers_hook_fail_open_implementation",
                "assert_eq hook fail open stale missing index",
            ),
            make_ranked(
                0.16,
                "src/hook.rs",
                327,
                407,
                "run_hook",
                "index_staleness staleness stale std::process::exit(0) action pass",
            ),
        ];
        rerank_results(&mut results, HOOK_QUERY);
        assert_eq!(results[0].path, "src/hook.rs");
    }

    #[test]
    fn rerank_prefers_gain_for_hook_log_savings_analytics() {
        let mut results = vec![
            make_ranked(
                0.01,
                "src/hook.rs",
                327,
                407,
                "run_hook",
                "log_hook_event saved_tokens actual_tokens original_estimate",
            ),
            make_ranked(
                0.16,
                "src/gain.rs",
                75,
                135,
                "compute_gain",
                "compute_gain read_hook_log tokens_saved original_estimate hook log",
            ),
        ];
        rerank_results(
            &mut results,
            "how are token savings calculated from hook log",
        );
        assert_eq!(results[0].path, "src/gain.rs");
        assert_eq!(results[0].symbol, "compute_gain");
    }

    #[test]
    fn rerank_prefers_compress_for_cargo_output_errors() {
        let mut results = vec![
            make_ranked(
                0.01,
                "src/filters.rs",
                1,
                80,
                "apply_filter",
                "output filters keep lines matching error warning",
            ),
            make_ranked(
                0.16,
                "src/compress.rs",
                78,
                120,
                "compress_cargo",
                "compress cargo output keep errors warnings diagnostics",
            ),
        ];
        rerank_results(
            &mut results,
            "how does cargo output compression keep errors",
        );
        assert_eq!(results[0].path, "src/compress.rs");
        assert_eq!(results[0].symbol, "compress_cargo");
    }

    #[test]
    fn rerank_penalizes_benchmark_fixture_leakage() {
        let mut results = vec![
            make_ranked(
                0.01,
                "src/benchmark.rs",
                1,
                20,
                "measure_semantic_quality",
                "postgres transaction pool user repository pagination expected_paths",
            ),
            make_ranked(
                0.15,
                "benchmark/samples/database_client.ts",
                175,
                248,
                "UserRepository",
                "postgres transaction pool user repository pagination",
            ),
        ];
        rerank_results(
            &mut results,
            "postgres transaction pool user repository pagination",
        );
        assert_eq!(results[0].path, "benchmark/samples/database_client.ts");
    }

    #[test]
    fn rerank_prefers_code_over_docs_for_implementation_queries() {
        let mut results = vec![
            make_ranked(
                0.01,
                "AGENTS.md",
                1,
                40,
                "",
                "Rust files chunked into symbols and outlines for agents",
            ),
            make_ranked(
                0.18,
                "src/chunker.rs",
                223,
                225,
                "chunk_rust",
                "fn chunk_rust content symbol outline rust chunk",
            ),
        ];
        rerank_results(&mut results, CHUNKING_QUERY);
        assert_eq!(results[0].path, "src/chunker.rs");
    }

    #[test]
    fn rerank_prefers_chunker_over_indexer_for_chunking_queries() {
        let mut results = vec![
            make_ranked(
                0.02,
                "src/indexer.rs",
                88,
                119,
                "walk_indexable_files",
                "walk files should_index index repository source files",
            ),
            make_ranked(
                0.24,
                "src/chunker.rs",
                292,
                308,
                "chunk_file",
                "fn chunk_file rust files symbols outlines generate_outline symbol aware chunk_rust",
            ),
        ];
        rerank_results(&mut results, CHUNKING_QUERY);
        assert_eq!(results[0].path, "src/chunker.rs");
    }

    #[test]
    fn rerank_penalizes_filter_assets_for_code_queries() {
        let mut results = vec![
            make_ranked(
                0.01,
                "assets/filters/rsync.toml",
                1,
                48,
                "",
                "failed missing error compact output",
            ),
            make_ranked(
                0.18,
                "src/hook.rs",
                326,
                406,
                "run_hook",
                "hook staleness stale index exit 0 action pass",
            ),
        ];
        rerank_results(&mut results, HOOK_QUERY);
        assert_eq!(results[0].path, "src/hook.rs");
    }

    #[test]
    fn rerank_prefers_project_intent_matches() {
        // Each case: (query, semantically closer distractor, expected winner),
        // where a candidate is (path, symbol, content).
        let cases = [
            (
                "pwa mobile capacitor tests links app manifest build",
                (
                    "tests/test_links.py",
                    "test_links",
                    "pytest validates public links",
                ),
                (
                    "tests/test_pwa.py",
                    "test_pwa_manifest",
                    "pytest pwa mobile capacitor manifest service worker build",
                ),
            ),
            (
                "geracao de arte imagem capitulo personagens local art pipeline",
                (
                    "Cronologia/analise_capitulo_122.md",
                    "",
                    "analise capitulo personagens arte imagem narrativa",
                ),
                (
                    "scripts/art_gen/local_pipeline.py",
                    "LocalPipelineManager",
                    "local pipeline image generator personagens chapter art",
                ),
            ),
            (
                "analise de coerencia narrativa capitulos cronologia fumo Gabo recomendacoes",
                (
                    "Cronologia/analise_capitulo_121.md",
                    "",
                    "analise capitulo fumo gabo evidencia narrativa",
                ),
                (
                    "docs/analise_coerencia.md",
                    "",
                    "analise coerencia relatorio recomendacoes narrativa fumo gabo",
                ),
            ),
            (
                "capitulo 133 materia prima personagens gabo valeria aria rangel taxidermista",
                (
                    "scripts/generate_chapter_art.py",
                    "generate_chapter_art",
                    "capitulo personagens gabo valeria aria rangel art generator",
                ),
                (
                    "docs/public/capitulo-133.md",
                    "",
                    "capitulo 133 materia prima personagens gabo valeria aria rangel taxidermista",
                ),
            ),
        ];

        for (query, distractor, expected) in cases {
            let (distractor_path, distractor_symbol, distractor_content) = distractor;
            let (expected_path, expected_symbol, expected_content) = expected;
            let mut results = vec![
                make_ranked(
                    0.01,
                    distractor_path,
                    1,
                    80,
                    distractor_symbol,
                    distractor_content,
                ),
                make_ranked(0.24, expected_path, 1, 90, expected_symbol, expected_content),
            ];
            rerank_results(&mut results, query);
            assert_eq!(results[0].path, expected_path, "query: {query}");
        }
    }

    #[test]
    fn query_terms_fold_latin_accents() {
        let terms = query_terms(
            "cap\u{00ed}tulo mat\u{00e9}ria val\u{00e9}ria an\u{00e1}lise coer\u{00ea}ncia recomenda\u{00e7}\u{00f5}es",
        );
        for expected in [
            "capitulo",
            "materia",
            "valeria",
            "analise",
            "coerencia",
            "recomendacoes",
        ] {
            assert!(terms.contains(&expected.to_string()));
        }
    }
}