use anyhow::Result;
use colored::Colorize;
use serde::Serialize;
use serde_json::Value;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use walkdir::WalkDir;
use crate::chunker::count_tokens;
use crate::filters;
use crate::ui;
/// Selects which agent's transcript locations are audited.
///
/// `All` matches every per-agent check in this module, so it enables all
/// known locations at once.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Agent {
/// Audit only the `.claude` transcript directory.
Claude,
/// Audit only the `.codex` sessions directory.
Codex,
/// Audit the `.copilot` directories and the Copilot SQLite session store.
Copilot,
/// Audit only the `.openai` directory.
OpenAi,
/// Audit every location listed above.
All,
}
/// Category assigned to an oversized piece of text found in a transcript.
///
/// Serialized in kebab-case (e.g. `full-read`), which matches the strings
/// returned by `Scenario::label`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize)]
#[serde(rename_all = "kebab-case")]
pub enum Scenario {
/// Text that looks like a line-numbered file listing or came from a read-style tool.
FullRead,
/// Bulky output attributed to a shell/CLI command or a generic tool result.
LargeCommandOutput,
/// System/developer instructions, skill listings and similar bootstrap prompt text.
BootstrapPrompt,
/// Hook output payloads (permission decisions, hook-specific output).
HookMetadata,
/// Tool or MCP schema text.
ToolSchema,
/// Diff output.
DiffDump,
/// Test runner output.
TestLog,
/// Embedded task detail / task artifact context.
TaskContextBlob,
/// Log snapshots returned into the conversation.
AgentLogBlob,
/// Inline image data.
ImageBlob,
/// JSON payloads that look like connector/API responses.
ConnectorJsonBlob,
/// Patch payloads passed as tool input.
PatchBlob,
/// HTML or other generated build output.
BuildArtifactBlob,
/// Values stored under a JSON path containing `signature`.
SignatureBlob,
/// Documentation index style content.
DocumentationBlob,
/// Fallback when no other rule matches.
UnknownLarge,
}
impl Scenario {
fn label(self) -> &'static str {
match self {
Scenario::FullRead => "full-read",
Scenario::LargeCommandOutput => "large-command-output",
Scenario::BootstrapPrompt => "bootstrap-prompt",
Scenario::HookMetadata => "hook-metadata",
Scenario::ToolSchema => "tool-schema",
Scenario::DiffDump => "diff-dump",
Scenario::TestLog => "test-log",
Scenario::TaskContextBlob => "task-context-blob",
Scenario::AgentLogBlob => "agent-log-blob",
Scenario::ImageBlob => "image-blob",
Scenario::ConnectorJsonBlob => "connector-json-blob",
Scenario::PatchBlob => "patch-blob",
Scenario::BuildArtifactBlob => "build-artifact-blob",
Scenario::SignatureBlob => "signature-blob",
Scenario::DocumentationBlob => "documentation-blob",
Scenario::UnknownLarge => "unknown-large",
}
}
fn recommendation(self) -> &'static str {
match self {
Scenario::FullRead => {
"Use tokenix read/query first; then symbol or offset/limit reads."
}
Scenario::LargeCommandOutput => {
"Route command through tokenix run and add/adjust an output filter."
}
Scenario::BootstrapPrompt => {
"Reduce installed skills/tools/MCP prompt weight; check prompt-audit."
}
Scenario::HookMetadata => {
"Keep hook success payloads minimal and avoid duplicated updatedInput fields."
}
Scenario::ToolSchema => "Disable unused MCP servers or switch to a slim MCP profile.",
Scenario::DiffDump => "Use git diff --stat/name-only first, then targeted hunks.",
Scenario::TestLog => {
"Keep failures, summaries, and first diagnostics; collapse passing noise."
}
Scenario::TaskContextBlob => {
"Pass task metadata/artifacts as indexed references, not full embedded blobs."
}
Scenario::AgentLogBlob => {
"Filter logs before returning them; keep errors plus bounded head/tail windows."
}
Scenario::ImageBlob => {
"Store image attachments by path/hash; do not replay base64 into context."
}
Scenario::ConnectorJsonBlob => {
"Request compact fields/pages from connectors; avoid returning full API payloads."
}
Scenario::PatchBlob => {
"Prefer file patch references or bounded hunks over full patch payloads."
}
Scenario::BuildArtifactBlob => {
"Avoid returning generated/minified build artifacts; inspect sources instead."
}
Scenario::SignatureBlob => {
"Do not replay provider signature/provenance fields into assistant context."
}
Scenario::DocumentationBlob => {
"Use documentation indexes/URLs as references, then fetch only needed pages."
}
Scenario::UnknownLarge => {
"Inspect the preview and add a filter or hook rule for this shape."
}
}
}
}
/// One oversized piece of text discovered during the audit.
#[derive(Clone, Debug, Serialize)]
pub struct Finding {
/// Agent key the finding was attributed to (e.g. `"claude"`, `"copilot"`).
pub agent: String,
/// Category chosen by `classify`.
pub scenario: Scenario,
/// Display form of the file (or database) path the text came from.
pub file: String,
/// 1-based line within a JSONL file, `1` for whole-document JSON and plain
/// text files, or the 1-based row index for SQLite rows.
pub line: usize,
/// Slash-separated JSON key path to the string; may be empty or a bare key/table name.
pub json_path: String,
/// Size of the text as reported by `str::len`, i.e. UTF-8 bytes.
pub chars: usize,
/// Token count as computed by `count_tokens`.
pub tokens: usize,
/// Tool name associated with the text, when one could be determined.
pub tool: Option<String>,
/// Command line associated with the text, when one could be determined.
pub command: Option<String>,
/// Description of the output filter matching `command`, if any.
pub filter: Option<String>,
/// Short single-line excerpt of the text.
pub preview: String,
}
/// Aggregate of all findings that share one scenario.
#[derive(Clone, Debug, Default, Serialize)]
struct ScenarioSummary {
/// Scenario label (kebab-case).
scenario: String,
/// Number of findings in this scenario.
count: usize,
/// Sum of `Finding::chars` over those findings.
chars: usize,
/// Sum of `Finding::tokens` over those findings.
tokens: usize,
/// Remediation hint for the scenario.
recommendation: String,
}
/// Full audit result, printed either as JSON or as a human-readable summary.
#[derive(Clone, Debug, Serialize)]
struct Report {
/// Size threshold that was applied when collecting findings.
min_chars: usize,
/// Number of transcript files visited.
scanned_files: usize,
/// Findings kept after sorting by size and applying the limit.
findings: Vec<Finding>,
/// Per-scenario aggregates computed from `findings`.
scenarios: Vec<ScenarioSummary>,
}
/// Inputs for [`run`].
pub struct Options {
/// Which agent's locations to scan.
pub agent: Agent,
/// Minimum text size (compared against `str::len`) for a string to be reported.
pub min_chars: usize,
/// Maximum number of findings kept after sorting by size.
pub limit: usize,
/// Print the report as pretty JSON instead of the human-readable layout.
pub json: bool,
}
pub fn run(options: Options) -> Result<()> {
let report = audit(options.agent, options.min_chars, options.limit)?;
if options.json {
println!("{}", serde_json::to_string_pretty(&report)?);
} else {
print_human(&report);
}
Ok(())
}
/// Scans every selected location and assembles the [`Report`].
///
/// Findings are ordered largest-first (by `chars`) and cut down to `limit`
/// entries; the scenario summary is computed from that trimmed list.
fn audit(agent: Agent, min_chars: usize, limit: usize) -> Result<Report> {
    // Fall back to the current directory when no home directory is known.
    let home = dirs::home_dir().unwrap_or_else(|| PathBuf::from("."));
    let filters = filters::load_all_filters();
    let mut findings = Vec::new();
    let mut scanned_files = 0usize;

    let present_roots = roots(&home, agent)
        .into_iter()
        .filter(|(_, root)| root.exists());
    for (agent_key, root) in present_roots {
        for path in transcript_files(&root, agent_key) {
            scanned_files += 1;
            // Each scanner decides for itself whether the file is relevant.
            scan_jsonl_file(&path, agent_key, min_chars, &filters, &mut findings);
            scan_text_file(&path, agent_key, min_chars, &filters, &mut findings);
        }
    }

    if agent == Agent::All || agent == Agent::Copilot {
        scan_copilot_sqlite(&home, min_chars, &filters, &mut findings)?;
    }

    findings.sort_by_key(|f| std::cmp::Reverse(f.chars));
    // `truncate` is a no-op when the list is already within the limit.
    findings.truncate(limit);

    let scenarios = summarize(&findings);
    Ok(Report {
        min_chars,
        scanned_files,
        findings,
        scenarios,
    })
}
/// Lists the `(agent key, directory)` pairs to walk for the selected agent.
///
/// `Agent::All` yields every known directory; the order is claude, codex,
/// copilot (two directories), openai.
fn roots(home: &Path, agent: Agent) -> Vec<(&'static str, PathBuf)> {
    let selected = |candidate: Agent| agent == Agent::All || agent == candidate;
    let mut found = Vec::new();
    if selected(Agent::Claude) {
        found.push(("claude", home.join(".claude").join("projects")));
    }
    if selected(Agent::Codex) {
        found.push(("codex", home.join(".codex").join("sessions")));
    }
    if selected(Agent::Copilot) {
        let copilot_home = home.join(".copilot");
        found.push(("copilot", copilot_home.join("session-state")));
        found.push(("copilot", copilot_home.join("logs")));
    }
    if selected(Agent::OpenAi) {
        found.push(("openai", home.join(".openai")));
    }
    found
}
/// Recursively collects candidate transcript files under `root`.
///
/// Only regular files with a `jsonl`, `json`, `log` or `txt` extension are
/// kept. For the `copilot` agent, files located under a `pkg` path component
/// are dropped unless their file name contains `session`. Directory entries
/// that cannot be read are skipped.
fn transcript_files(root: &Path, agent: &str) -> Vec<PathBuf> {
    let is_copilot = agent == "copilot";
    let mut kept = Vec::new();
    for entry in WalkDir::new(root).into_iter().filter_map(Result::ok) {
        if !entry.file_type().is_file() {
            continue;
        }
        let path = entry.into_path();
        let ext = path.extension().and_then(|x| x.to_str()).unwrap_or("");
        if !matches!(ext, "jsonl" | "json" | "log" | "txt") {
            continue;
        }
        if is_copilot {
            let name = path.file_name().and_then(|x| x.to_str()).unwrap_or("");
            let under_pkg = path.components().any(|c| c.as_os_str() == "pkg");
            if under_pkg && !name.contains("session") {
                continue;
            }
        }
        kept.push(path);
    }
    kept
}
/// Scans a file as JSON and records every oversized string it contains.
///
/// A `.json` file is first tried as one document (reported with line `1`).
/// If that fails, or for any other extension, each line is parsed on its own
/// and lines that are not valid JSON are skipped. Unreadable files are
/// ignored.
fn scan_jsonl_file(
    path: &Path,
    agent: &str,
    min_chars: usize,
    filters: &[filters::FilterDef],
    findings: &mut Vec<Finding>,
) {
    let raw = match std::fs::read_to_string(path) {
        Ok(contents) => contents,
        Err(_) => return,
    };
    let mut tool_map: HashMap<String, (String, Option<String>)> = HashMap::new();

    // Register tool calls first so outputs in the same value can be matched to them.
    let mut scan_value = |value: &Value, line_no: usize| {
        collect_tool_calls(value, &mut tool_map);
        collect_large_strings(
            value,
            String::new(),
            path,
            line_no,
            agent,
            min_chars,
            &tool_map,
            filters,
            findings,
        );
    };

    let is_json_doc = path.extension().and_then(|x| x.to_str()) == Some("json");
    if is_json_doc {
        if let Ok(document) = serde_json::from_str::<Value>(&raw) {
            scan_value(&document, 1);
            return;
        }
    }

    for (idx, line) in raw.lines().enumerate() {
        if let Ok(value) = serde_json::from_str::<Value>(line) {
            scan_value(&value, idx + 1);
        }
    }
}
/// Records a whole non-JSON file as a single finding when it is large enough.
///
/// Files with a `jsonl` or `json` extension are skipped here, as are
/// unreadable files and files shorter than `min_chars` (by `str::len`).
fn scan_text_file(
    path: &Path,
    agent: &str,
    min_chars: usize,
    filters: &[filters::FilterDef],
    findings: &mut Vec<Finding>,
) {
    let is_json = matches!(
        path.extension().and_then(|x| x.to_str()),
        Some("jsonl" | "json")
    );
    if is_json {
        return;
    }
    let raw = match std::fs::read_to_string(path) {
        Ok(contents) if contents.len() >= min_chars => contents,
        _ => return,
    };
    let scenario = classify("", &raw, None);
    push_finding(
        agent, scenario, path, 1, "", None, None, filters, &raw, findings,
    );
}
/// Walks a JSON value and records every string of at least `min_chars` bytes.
///
/// `path_key` is the slash-separated key/index path to `value`. An object
/// that `push_tool_output_if_large` reports as a large tool output is recorded
/// once and not descended into.
#[allow(clippy::too_many_arguments)]
fn collect_large_strings(
    value: &Value,
    path_key: String,
    file: &Path,
    line: usize,
    agent: &str,
    min_chars: usize,
    tool_map: &HashMap<String, (String, Option<String>)>,
    filters: &[filters::FilterDef],
    findings: &mut Vec<Finding>,
) {
    match value {
        Value::String(text) => {
            if text.len() < min_chars {
                return;
            }
            // No command is known for a bare string; only a path-derived tool hint.
            let tool = tool_from_path(&path_key);
            let scenario = classify(&path_key, text, tool.as_deref());
            push_finding(
                agent, scenario, file, line, &path_key, tool, None, filters, text, findings,
            );
        }
        Value::Array(items) => {
            for (index, item) in items.iter().enumerate() {
                let child_path = format!("{path_key}/{index}");
                collect_large_strings(
                    item, child_path, file, line, agent, min_chars, tool_map, filters, findings,
                );
            }
        }
        Value::Object(map) => {
            let handled = push_tool_output_if_large(
                map, file, line, agent, min_chars, tool_map, filters, findings,
            );
            if handled {
                return;
            }
            for (key, child) in map {
                let child_path = format!("{path_key}/{key}");
                collect_large_strings(
                    child, child_path, file, line, agent, min_chars, tool_map, filters, findings,
                );
            }
        }
        _ => {}
    }
}
/// Records `map` as one finding when it is a large tool-output object.
///
/// Recognised shapes are `function_call_output` (`call_id` / `output`) and
/// `tool_result` (`tool_use_id` / `content`); a `response_item` wrapper is
/// unwrapped through its `payload` object. The tool name and command are
/// looked up in `tool_map` by call id, with `"tool-output"` as the fallback
/// tool name.
///
/// Returns `true` when a finding was pushed, `false` otherwise.
#[allow(clippy::too_many_arguments)]
fn push_tool_output_if_large(
    map: &serde_json::Map<String, Value>,
    file: &Path,
    line: usize,
    agent: &str,
    min_chars: usize,
    tool_map: &HashMap<String, (String, Option<String>)>,
    filters: &[filters::FilterDef],
    findings: &mut Vec<Finding>,
) -> bool {
    let kind = map.get("type").and_then(Value::as_str);

    if let (Some("response_item"), Some(Value::Object(payload))) = (kind, map.get("payload")) {
        return push_tool_output_if_large(
            payload, file, line, agent, min_chars, tool_map, filters, findings,
        );
    }

    let (id_key, text_key) = match kind {
        Some("function_call_output") => ("call_id", "output"),
        Some("tool_result") => ("tool_use_id", "content"),
        _ => return false,
    };

    // Only plain-string outputs that reach the size threshold are reported here.
    let text = match map.get(text_key).and_then(Value::as_str) {
        Some(body) if body.len() >= min_chars => body,
        _ => return false,
    };

    let call = map
        .get(id_key)
        .and_then(Value::as_str)
        .and_then(|id| tool_map.get(id));
    let (tool, command) = match call {
        Some((name, cmd)) => (Some(name.clone()), cmd.clone()),
        None => (Some("tool-output".to_string()), None),
    };

    // Prefer the command for classification; fall back to the tool name.
    let hint = command.as_deref().or(tool.as_deref());
    let scenario = classify(text_key, text, hint);
    push_finding(
        agent, scenario, file, line, text_key, tool, command, filters, text, findings,
    );
    true
}
/// Walks a JSON value and registers every tool call it contains in `tool_map`.
///
/// Two call shapes are recognised on objects, keyed by their `type` field:
/// * `function_call` — keyed by `call_id`; the command is extracted from the
///   JSON-encoded `arguments` string.
/// * `tool_use` — keyed by `id`; the command is read from `input.command`.
///
/// Each entry maps the call id to `(tool name, optional command)`. The walk
/// descends into both objects and arrays, so calls nested inside an array
/// (for example a message `content` array) are registered as well. Wrapper
/// objects such as `response_item` need no special handling because their
/// `payload` is reached by the generic descent.
fn collect_tool_calls(value: &Value, tool_map: &mut HashMap<String, (String, Option<String>)>) {
    match value {
        Value::Array(items) => {
            for item in items {
                collect_tool_calls(item, tool_map);
            }
        }
        Value::Object(map) => {
            match map.get("type").and_then(Value::as_str) {
                Some("function_call") => {
                    if let Some(id) = map.get("call_id").and_then(Value::as_str) {
                        let name = map
                            .get("name")
                            .and_then(Value::as_str)
                            .unwrap_or("function_call")
                            .to_string();
                        let command = map
                            .get("arguments")
                            .and_then(Value::as_str)
                            .and_then(extract_command_arg);
                        tool_map.insert(id.to_string(), (name, command));
                    }
                }
                Some("tool_use") => {
                    if let Some(id) = map.get("id").and_then(Value::as_str) {
                        let name = map
                            .get("name")
                            .and_then(Value::as_str)
                            .unwrap_or("tool_use")
                            .to_string();
                        let command = map
                            .get("input")
                            .and_then(|input| input.get("command"))
                            .and_then(Value::as_str)
                            .map(str::to_string);
                        tool_map.insert(id.to_string(), (name, command));
                    }
                }
                _ => {}
            }
            // Visit every child exactly once, whatever the object's own type.
            for child in map.values() {
                collect_tool_calls(child, tool_map);
            }
        }
        _ => {}
    }
}
/// Extracts the command line from a JSON-encoded tool-argument string.
///
/// The first of the keys `command`, `CommandLine`, `commandLine` that is
/// present is used. Returns `None` when `raw` is not valid JSON, none of the
/// keys is present, or the selected value is not a string.
fn extract_command_arg(raw: &str) -> Option<String> {
    let parsed: Value = serde_json::from_str(raw).ok()?;
    let chosen = ["command", "CommandLine", "commandLine"]
        .iter()
        .find_map(|key| parsed.get(*key))?;
    chosen.as_str().map(str::to_string)
}
/// Derives a coarse tool label from a JSON key path.
///
/// Matching is case-insensitive (ASCII) and the first matching group wins:
/// tool output, then system prompt, then skill listing. Returns `None` when
/// the path contains none of the known markers.
fn tool_from_path(path_key: &str) -> Option<String> {
    const GROUPS: [(&[&str], &str); 3] = [
        (&["tool_result", "function_call_output"], "tool-output"),
        (&["base_instructions", "developer_instructions"], "system-prompt"),
        (&["skill_listing"], "skill-listing"),
    ];
    let lowered = path_key.to_ascii_lowercase();
    GROUPS
        .iter()
        .find(|(markers, _)| markers.iter().any(|marker| lowered.contains(*marker)))
        .map(|(_, label)| (*label).to_string())
}
/// Assigns a [`Scenario`] to a piece of text using ordered heuristics.
///
/// Inputs are compared case-insensitively (ASCII): the JSON key path, the
/// first 600 characters of `text`, and the optional command or tool name.
/// Rules are evaluated top to bottom and the first match wins, so the order
/// of the checks below is significant. `Scenario::UnknownLarge` is the
/// fallback.
fn classify(path_key: &str, text: &str, command_or_tool: Option<&str>) -> Scenario {
    /// True when `haystack` contains at least one of `needles`.
    fn has_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|needle| haystack.contains(*needle))
    }
    /// True when `haystack` starts with at least one of `prefixes`.
    fn starts_with_any(haystack: &str, prefixes: &[&str]) -> bool {
        prefixes.iter().any(|prefix| haystack.starts_with(*prefix))
    }

    let path = path_key.to_ascii_lowercase();
    // Only the head of the text is inspected.
    let head = text
        .chars()
        .take(600)
        .collect::<String>()
        .to_ascii_lowercase();
    let cmd = command_or_tool.unwrap_or("").to_ascii_lowercase();

    let bootstrap_path = has_any(
        &path,
        &["base_instructions", "developer_instructions", "skill_listing"],
    );
    let bootstrap_text = has_any(
        &head,
        &[
            "# desired oververbosity",
            "you are codex",
            "base directory for this skill:",
            "# update config skill",
            "project mika",
            "you output only a thread title",
        ],
    );
    if bootstrap_path || bootstrap_text {
        return Scenario::BootstrapPrompt;
    }

    if path.contains("signature") {
        return Scenario::SignatureBlob;
    }

    if starts_with_any(&head, &["data:image/", "/9j/"])
        || has_any(&path, &["image_url", "/source/data", "/file/base64"])
    {
        return Scenario::ImageBlob;
    }

    if has_any(
        &head,
        &[
            "documentation index",
            "llms.txt",
            "use this file to discover all available pages",
        ],
    ) {
        return Scenario::DocumentationBlob;
    }

    if has_any(
        &head,
        &[
            "<div id=\"app\"",
            "<!doctype html",
            ".vitepress/dist/",
            "diagramcode",
        ],
    ) || path.contains("rawsvg")
    {
        return Scenario::BuildArtifactBlob;
    }

    // Three separate shapes are all treated as connector JSON payloads.
    let connector_result = path.contains("/payload/result/ok/content")
        && starts_with_any(&head, &["{\"issues\"", "{\"repositories\"", "{\"data\""]);
    let connector_output = path == "output"
        && head.contains(" output: {\"")
        && has_any(&head, &["\"issues\"", "\"repositories\"", "\"data\""]);
    let connector_status = head.starts_with("{\"data\":")
        && has_any(&head, &["\"enabled\"", "\"authenticated\"", "\"loginurl\""]);
    if connector_result || connector_output || connector_status {
        return Scenario::ConnectorJsonBlob;
    }

    if path.contains("/payload/input")
        && has_any(
            &head,
            &["*** begin patch", "*** update file", "*** add file"],
        )
    {
        return Scenario::PatchBlob;
    }

    if has_any(
        &head,
        &[
            "=== task detail ===",
            "=== task artifacts ===",
            "context bundle",
        ],
    ) || has_any(&path, &["task_artifact", "task_detail"])
    {
        return Scenario::TaskContextBlob;
    }

    if has_any(
        &head,
        &[
            "litellm proxy:",
            "---log snapshot---",
            "=== homolog start ===",
            "/.local/share/opencode/log/",
            "defaulted container",
            "select-string :",
            "service=llm",
        ],
    ) || head.starts_with("error 20")
    {
        return Scenario::AgentLogBlob;
    }

    if has_any(&head, &["\"hookspecificoutput\"", "permissiondecision"])
        || path.contains("hook_success")
    {
        return Scenario::HookMetadata;
    }

    if has_any(&head, &["\"input_schema\"", "\"tool\"", "mcp"]) {
        return Scenario::ToolSchema;
    }

    if cmd.contains("git diff") || has_any(&head, &["diff --git ", "@@"]) {
        return Scenario::DiffDump;
    }

    // Text-only command-output signatures are checked before the test-log rule.
    if head.starts_with("grep:") || has_any(&head, &["terraform", "refreshing state", "module."]) {
        return Scenario::LargeCommandOutput;
    }

    if cmd.contains("test")
        || has_any(
            &head,
            &["running ", "test result:", "turbo run test", "failed"],
        )
    {
        return Scenario::TestLog;
    }

    if has_any(
        &cmd,
        &[
            "kubectl", "git ", "npm ", "pnpm ", "cargo ", "node ", "tree", "gh run",
        ],
    ) || has_any(&path, &["function_call_output", "tool_result"])
    {
        return Scenario::LargeCommandOutput;
    }

    if cmd.contains("read") || looks_like_numbered_file(text) {
        return Scenario::FullRead;
    }

    Scenario::UnknownLarge
}
/// Heuristically detects text that is a line-numbered file listing.
///
/// Among the first 30 lines, a line counts as numbered when, after leading
/// whitespace, it starts with one or more ASCII digits immediately followed
/// by a tab or a space. Returns `true` when at least five such lines exist.
fn looks_like_numbered_file(text: &str) -> bool {
    let is_numbered = |line: &str| {
        let trimmed = line.trim_start();
        let after_digits = trimmed.trim_start_matches(|c: char| c.is_ascii_digit());
        let had_digits = after_digits.len() < trimmed.len();
        had_digits && matches!(after_digits.chars().next(), Some('\t' | ' '))
    };
    text.lines().take(30).filter(|line| is_numbered(line)).count() >= 5
}
/// Builds a [`Finding`] for `text` and appends it to `findings`.
///
/// * `json_path` — key path (or table / key name) the text was found under.
/// * `tool` / `command` — optional provenance recorded as-is.
/// * `filters` — known output filters; when `command` matches one, the
///   finding records the filter's description, or the literal
///   `"matched filter"` when the filter has no description.
///
/// `chars` is recorded as `text.len()`, i.e. the UTF-8 byte length, and
/// `tokens` comes from `count_tokens`.
#[allow(clippy::too_many_arguments)]
fn push_finding(
    agent: &str,
    scenario: Scenario,
    file: &Path,
    line: usize,
    json_path: &str,
    tool: Option<String>,
    command: Option<String>,
    filters: &[filters::FilterDef],
    text: &str,
    findings: &mut Vec<Finding>,
) {
    // A single lookup covers both the described and the undescribed filter case.
    let filter = command
        .as_deref()
        .and_then(|cmd| filters::find_filter(cmd, filters))
        .map(|f| {
            f.description
                .clone()
                .unwrap_or_else(|| "matched filter".to_string())
        });
    findings.push(Finding {
        agent: agent.to_string(),
        scenario,
        file: file.display().to_string(),
        line,
        json_path: json_path.to_string(),
        chars: text.len(),
        tokens: count_tokens(text),
        tool,
        command,
        filter,
        preview: preview(text),
    });
}
/// Returns the first 180 characters of `text` on a single line.
///
/// Carriage returns, newlines and tabs are each replaced by one space.
fn preview(text: &str) -> String {
    text.chars()
        .take(180)
        .map(|c| match c {
            '\r' | '\n' | '\t' => ' ',
            other => other,
        })
        .collect()
}
/// Aggregates findings per scenario.
///
/// Each summary carries the number of findings plus the summed `chars` and
/// `tokens`. The result is ordered by total tokens, largest first; scenarios
/// with equal totals are ordered by label so the output is the same on every
/// run (the intermediate hash map has no stable iteration order).
fn summarize(findings: &[Finding]) -> Vec<ScenarioSummary> {
    let mut by: HashMap<Scenario, ScenarioSummary> = HashMap::new();
    for f in findings {
        let entry = by.entry(f.scenario).or_insert_with(|| ScenarioSummary {
            scenario: f.scenario.label().to_string(),
            recommendation: f.scenario.recommendation().to_string(),
            ..ScenarioSummary::default()
        });
        entry.count += 1;
        entry.chars += f.chars;
        entry.tokens += f.tokens;
    }
    let mut out: Vec<_> = by.into_values().collect();
    out.sort_by(|a, b| {
        b.tokens
            .cmp(&a.tokens)
            .then_with(|| a.scenario.cmp(&b.scenario))
    });
    out
}
fn print_human(report: &Report) {
ui::box_header("conversation-audit");
println!(
" scanned: {} files · min {} chars · findings {}",
report.scanned_files,
report.min_chars,
report.findings.len()
);
println!();
for s in &report.scenarios {
println!(
" {:<22} {:>4} hits {:>8} tokens {}",
s.scenario.cyan(),
s.count,
crate::ui::format_num(s.tokens as i64).yellow(),
s.recommendation.dimmed()
);
}
if !report.findings.is_empty() {
println!("\n{}", "top findings".bold());
}
for f in report.findings.iter().take(20) {
let filter = f
.filter
.as_deref()
.map(|x| format!(" filter={x}"))
.unwrap_or_default();
let command = f
.command
.as_deref()
.map(|x| format!(" cmd={}", trim(x, 76)))
.unwrap_or_default();
println!(
" {} {} tokens={} chars={}{}{}",
f.agent.dimmed(),
f.scenario.label().bold(),
crate::ui::format_num(f.tokens as i64),
crate::ui::format_num(f.chars as i64),
filter.dimmed(),
command.dimmed()
);
println!(" {}:{}", trim(&f.file, 100), f.line);
if !f.json_path.is_empty() {
println!(" path={}", trim(&f.json_path, 100).dimmed());
}
println!(" {}", f.preview.dimmed());
}
}
/// Shortens `s` to at most `max` characters, marking the cut with `...`.
///
/// Strings that already fit are returned unchanged. Otherwise the first
/// `max - 3` characters (saturating at zero) are kept and `...` is appended,
/// so for `max < 3` the result is just `...`.
fn trim(s: &str, max: usize) -> String {
    // A character at index `max` exists exactly when the string is too long.
    if s.char_indices().nth(max).is_none() {
        return s.to_string();
    }
    let keep = max.saturating_sub(3);
    // Byte offset of the first character that is dropped.
    let cut = s.char_indices().nth(keep).map_or(s.len(), |(offset, _)| offset);
    format!("{}...", &s[..cut])
}
/// Scans the Copilot SQLite session store for oversized text columns.
///
/// Best effort: a missing database or one that cannot be opened read-only is
/// silently skipped and the function still returns `Ok(())`.
fn scan_copilot_sqlite(
    home: &Path,
    min_chars: usize,
    filters: &[filters::FilterDef],
    findings: &mut Vec<Finding>,
) -> Result<()> {
    // Tables and the text columns inspected in each of them.
    const TABLES: [(&str, &[&str]); 2] = [
        ("turns", &["user_message", "assistant_response"]),
        ("checkpoints", &["title", "overview", "history", "work_done"]),
    ];

    let Some(db) = copilot_db_path(home) else {
        return Ok(());
    };
    let Ok(conn) =
        rusqlite::Connection::open_with_flags(&db, rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY)
    else {
        return Ok(());
    };

    for (table, columns) in TABLES {
        scan_copilot_table(&conn, &db, table, columns, min_chars, filters, findings);
    }
    Ok(())
}
/// Locates the Copilot `session-store.db` file, if one exists.
///
/// The path under `%APPDATA%/Code/User/globalStorage/github.copilot-chat`
/// is preferred when the `APPDATA` variable is set and the file exists;
/// otherwise `<home>/.copilot/session-store.db` is used when present.
fn copilot_db_path(home: &Path) -> Option<PathBuf> {
    let appdata_db = std::env::var_os("APPDATA").map(|appdata| {
        [
            "Code",
            "User",
            "globalStorage",
            "github.copilot-chat",
            "session-store.db",
        ]
        .iter()
        .fold(PathBuf::from(appdata), |dir, part| dir.join(part))
    });
    appdata_db.filter(|db| db.exists()).or_else(|| {
        let home_db = home.join(".copilot").join("session-store.db");
        if home_db.exists() {
            Some(home_db)
        } else {
            None
        }
    })
}
/// Reports oversized text values from the given columns of one table.
///
/// Best effort: if the statement cannot be prepared or executed the table is
/// skipped. Values that cannot be read as text are treated as absent, and
/// rows that fail to load are dropped. `Finding::line` is the 1-based index
/// among the rows that loaded successfully.
fn scan_copilot_table(
    conn: &rusqlite::Connection,
    db: &Path,
    table: &str,
    columns: &[&str],
    min_chars: usize,
    filters: &[filters::FilterDef],
    findings: &mut Vec<Finding>,
) {
    let sql = format!("SELECT {} FROM {table}", columns.join(","));
    let Ok(mut stmt) = conn.prepare(&sql) else {
        return;
    };
    let width = columns.len();
    let Ok(rows) = stmt.query_map([], |row| {
        let cells: Vec<Option<String>> = (0..width)
            .map(|i| row.get::<_, Option<String>>(i).unwrap_or_default())
            .collect();
        Ok(cells)
    }) else {
        return;
    };
    for (row_idx, cells) in rows.filter_map(|r| r.ok()).enumerate() {
        let large_cells = cells
            .into_iter()
            .flatten()
            .filter(|text| text.len() >= min_chars);
        for text in large_cells {
            let scenario = classify(table, &text, None);
            push_finding(
                "copilot",
                scenario,
                db,
                row_idx + 1,
                table,
                Some(table.to_string()),
                None,
                filters,
                &text,
                findings,
            );
        }
    }
}
// Unit tests for the `classify` heuristics.
#[cfg(test)]
mod tests {
use super::*;
// Ten tab-separated numbered lines with a `Read` tool hint classify as a full read.
#[test]
fn classifies_full_read_from_numbered_lines() {
let text = (1..=10)
.map(|i| format!("{i}\tfn item_{i}() {{}}"))
.collect::<Vec<_>>()
.join("\n");
assert_eq!(classify("", &text, Some("Read")), Scenario::FullRead);
}
// Both the `base_instructions` path and the "you are codex" text mark bootstrap prompts.
#[test]
fn classifies_bootstrap_prompt() {
assert_eq!(
classify("/payload/base_instructions/text", "You are Codex", None),
Scenario::BootstrapPrompt
);
}
// NOTE: `/payload/output` contains neither `function_call_output` nor
// `tool_result`, so the result here is driven by the `kubectl` command hint.
#[test]
fn classifies_command_output_from_call_output_path() {
assert_eq!(
classify(
"/payload/output",
"kubectl get pods\n".repeat(400).as_str(),
Some("kubectl get pods")
),
Scenario::LargeCommandOutput
);
}
}