use std::collections::hash_map::DefaultHasher;
use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use std::path::Path;
use sqz_engine::{format_command, CompressedContent, DependencyMapper, NgramAbbreviator, SqzEngine};
/// Base names of command-line programs that this proxy recognises.
///
/// `CliProxy::is_known_command` compares the basename of a command's first
/// word against this table, ignoring ASCII case. The group comments below are
/// only a reading aid; the table is consumed as one flat list.
pub const CLI_PATTERNS: &[&str] = &[
// Version control.
"git", "hg", "svn", "fossil",
// Build tools.
"cargo", "make", "cmake", "ninja", "bazel", "buck", "gradle", "mvn",
"ant", "sbt", "lein", "mix", "rebar3",
// Language-level package and dependency managers.
"npm", "yarn", "pnpm", "bun", "pip", "pip3", "poetry", "pipenv",
"conda", "gem", "bundle", "composer", "go", "dep", "glide",
// System package managers.
"apt", "apt-get", "dpkg", "yum", "dnf", "rpm", "pacman", "brew",
"port", "snap", "flatpak", "nix", "guix",
// Containers, orchestration and related infrastructure tools.
"docker", "podman", "buildah", "skopeo", "kubectl", "helm", "k9s",
"minikube", "kind", "k3s", "nomad", "consul", "vault",
// Cloud provider and infrastructure-as-code CLIs.
"aws", "az", "gcloud", "gsutil", "terraform", "pulumi", "cdk",
"serverless", "sam",
// Language runtimes and compilers.
"node", "deno", "python", "python3", "ruby", "java", "kotlin",
"scala", "clojure", "elixir", "erlang", "ghc", "rustc", "clang",
"gcc", "g++",
// Test runners.
"jest", "mocha", "pytest", "rspec", "minitest", "phpunit", "vitest",
"cypress", "playwright",
// Linters and formatters.
"eslint", "tslint", "prettier", "black", "isort", "flake8", "mypy",
"pylint", "rubocop", "golangci-lint", "clippy", "rustfmt",
// Networking and remote access.
"curl", "wget", "ssh", "scp", "rsync", "nc", "netstat", "ss",
"ping", "traceroute", "dig", "nslookup", "openssl",
// File listing, searching and text processing.
"find", "grep", "rg", "ag", "fd", "ls", "tree", "cat", "less",
"head", "tail", "wc", "sort", "uniq", "awk", "sed", "jq", "yq",
// Database clients.
"psql", "mysql", "sqlite3", "mongo", "redis-cli", "influx",
// Code-hosting and issue-tracker CLIs.
"gh", "hub", "lab", "glab", "jira", "linear",
// Configuration management.
"ansible", "chef", "puppet", "salt",
// Media conversion and inspection.
"ffmpeg", "convert", "identify",
];
/// Returns a 64-bit fingerprint of `content`, computed by feeding the string
/// to a freshly created `DefaultHasher`.
///
/// The value is used as a cheap in-memory key; it is not a cryptographic
/// digest.
fn content_hash(content: &str) -> u64 {
    let mut state = DefaultHasher::new();
    Hash::hash(content, &mut state);
    Hasher::finish(&state)
}
/// Record pairing a 64-bit hash with a token count.
///
/// Nothing in the visible part of this file constructs or reads this type,
/// which is why dead-code warnings are silenced for it.
// NOTE(review): looks like a leftover from an earlier cache design — confirm
// whether it can be removed.
#[allow(dead_code)]
struct CacheEntry {
// Presumably a `content_hash` value of the cached output — TODO confirm.
hash: u64,
// Presumably the token estimate of the uncompressed output — TODO confirm.
tokens_original: u32,
}
/// Proxy that takes the textual output of a command and returns a
/// deduplicated, formatted or compressed replacement for it.
///
/// The mutable pieces sit behind `RefCell` because every method of the proxy
/// takes `&self`.
pub struct CliProxy {
// Engine providing compression, the dedup cache manager and the session store.
engine: SqzEngine,
// `content_hash` values of outputs (and pre-cached dependency files) that
// this proxy has already processed.
l1_cache: std::cell::RefCell<HashSet<u64>>,
// Receives the content of files read through `cat`-like commands and is
// then queried for their dependencies.
dep_mapper: std::cell::RefCell<DependencyMapper>,
// Observes compressed output and abbreviates it.
abbreviator: std::cell::RefCell<NgramAbbreviator>,
}
impl CliProxy {
    /// Creates a proxy around a freshly initialised [`SqzEngine`].
    ///
    /// # Errors
    ///
    /// Returns the error reported by `SqzEngine::new` when the engine cannot
    /// be created.
    pub fn new() -> sqz_engine::Result<Self> {
        let engine = SqzEngine::new()?;
        Ok(Self::with_engine(engine))
    }

    /// Creates a proxy around an already constructed engine, starting with an
    /// empty L1 cache, dependency mapper and abbreviator.
    // Part of the public surface; the lint is silenced in case no caller in
    // this crate uses it.
    #[allow(dead_code)]
    pub fn with_engine(engine: SqzEngine) -> Self {
        Self {
            engine,
            l1_cache: std::cell::RefCell::new(HashSet::new()),
            dep_mapper: std::cell::RefCell::new(DependencyMapper::new()),
            abbreviator: std::cell::RefCell::new(NgramAbbreviator::new()),
        }
    }

    /// Turns the raw `output` of `cmd` into the text that should be shown
    /// instead.
    ///
    /// The steps, in order:
    ///
    /// 1. Record the file being read if `cmd` is a `cat`-like command, and
    ///    advance the cache manager's turn counter.
    /// 2. If the engine's dedup cache already knows `output`, return the
    ///    inline reference it hands back.
    /// 3. If `format_command` produces a formatted version with a smaller
    ///    token estimate, return it (annotated with context references).
    /// 4. Otherwise compress `output`, apply n-gram abbreviation when it saves
    ///    tokens, and return that (annotated with context references).
    ///
    /// If compression fails, the original `output` is returned unchanged.
    /// Diagnostics are written to stderr.
    pub fn intercept_output(&self, cmd: &str, output: &str) -> String {
        self.track_file(cmd, output);
        self.engine.cache_manager().advance_turn();

        let fast_hash = content_hash(output);

        // L1 only says "this hash was seen"; the inline reference itself
        // always comes from the engine's dedup cache.
        if self.l1_cache.borrow().contains(&fast_hash) {
            if let Ok(Some(inline_ref)) =
                self.engine.cache_manager().check_dedup(output.as_bytes())
            {
                eprintln!("[sqz] dedup hit: {} (L1+L2)", inline_ref);
                return inline_ref;
            }
        }
        if let Ok(Some(inline_ref)) = self.engine.cache_manager().check_dedup(output.as_bytes()) {
            self.l1_cache.borrow_mut().insert(fast_hash);
            eprintln!("[sqz] dedup hit: {} (L2)", inline_ref);
            return inline_ref;
        }

        // Command-specific formatting is used only when it actually shrinks
        // the estimated token count.
        if let Some(formatted) = format_command(cmd, output) {
            let tokens_original = Self::estimate_tokens(output);
            let tokens_compressed = Self::estimate_tokens(&formatted);
            if tokens_compressed < tokens_original {
                let mode = self.engine.route_compression_mode(output);
                if mode != sqz_engine::CompressionMode::Safe {
                    if let Ok(compressed) = self.engine.compress(&formatted) {
                        // Best effort: a failed cache store must not block the output.
                        let _ = self
                            .engine
                            .cache_manager()
                            .store_compressed(output.as_bytes(), &compressed);
                    }
                }
                self.l1_cache.borrow_mut().insert(fast_hash);
                self.log_compression(cmd, tokens_original, tokens_compressed);
                return self.apply_context_refs(&formatted);
            }
        }

        match self.compress_output(cmd, output) {
            Ok(compressed) => {
                let tokens_original = compressed.tokens_original;
                let tokens_compressed = compressed.tokens_compressed;
                let mode = self.engine.route_compression_mode(output);
                if mode != sqz_engine::CompressionMode::Safe {
                    // Best effort: a failed cache store must not block the output.
                    let _ = self
                        .engine
                        .cache_manager()
                        .store_compressed(output.as_bytes(), &compressed);
                }
                self.l1_cache.borrow_mut().insert(fast_hash);
                self.log_compression(cmd, tokens_original, tokens_compressed);

                let mut abbr = self.abbreviator.borrow_mut();
                abbr.observe(&compressed.data);
                let abbreviated = match abbr.abbreviate(&compressed.data) {
                    Ok(result) if result.total_tokens_saved > 0 => {
                        eprintln!(
                            "[sqz] n-gram abbreviation: {} tokens saved",
                            result.total_tokens_saved
                        );
                        result.text
                    }
                    // No saving (or an abbreviation error): keep the compressed text.
                    _ => compressed.data,
                };
                self.apply_context_refs(&abbreviated)
            }
            Err(e) => {
                eprintln!("[sqz] fallback: compression error for command '{cmd}': {e}");
                output.to_owned()
            }
        }
    }

    /// Estimates the token count of `text` as its byte length divided by
    /// four, rounded up.
    ///
    /// The arithmetic is done in `usize` and clamped to `u32::MAX`, so very
    /// large inputs saturate instead of truncating or overflowing.
    fn estimate_tokens(text: &str) -> u32 {
        let tokens = text.len().saturating_add(3) / 4;
        tokens.min(u32::MAX as usize) as u32
    }

    /// Writes a one-line compression summary to stderr and records the same
    /// figures in the session store.
    ///
    /// `original` and `compressed` are token counts; `cmd` is the command
    /// label shown in the summary.
    fn log_compression(&self, cmd: &str, original: u32, compressed: u32) {
        let saved = original.saturating_sub(compressed);
        // Integer arithmetic keeps the percentage exact; going through f64 and
        // truncating reports e.g. 28 for 29/100. `saved <= original`, so the
        // result is at most 100 and the cast cannot truncate.
        let pct = if original > 0 {
            (u64::from(saved) * 100 / u64::from(original)) as u32
        } else {
            0
        };
        eprintln!("[sqz] {}/{} tokens ({}% reduction) [{}]", compressed, original, pct, cmd);
        // Best effort: a failed session-store write must not affect the output.
        let _ = self.engine.session_store().log_compression(
            original, compressed, &[], cmd,
        );
    }

    /// Compresses `output` with the engine. The command name is currently
    /// not used.
    fn compress_output(
        &self,
        _cmd: &str,
        output: &str,
    ) -> sqz_engine::Result<CompressedContent> {
        self.engine.compress(output)
    }

    /// Marks references to files the session store already knows about.
    ///
    /// For every known file path `p`, occurrences of `--> p` become
    /// `--> p [in context]` and occurrences of `at p:` become
    /// `at p [in context]:`. The text is returned unchanged when the
    /// known-file list cannot be read or is empty.
    fn apply_context_refs(&self, text: &str) -> String {
        let known = match self.engine.session_store().known_files() {
            Ok(files) => files,
            Err(_) => return text.to_string(),
        };
        if known.is_empty() {
            return text.to_string();
        }

        let mut result = text.to_string();
        for file_path in &known {
            let marker = format!("--> {}", file_path);
            if result.contains(&marker) {
                let tagged = format!("{} [in context]", marker);
                result = result.replace(&marker, &tagged);
            }
            let at_marker = format!("at {}:", file_path);
            if result.contains(&at_marker) {
                let tagged = format!("at {} [in context]:", file_path);
                result = result.replace(&at_marker, &tagged);
            }
        }
        result
    }

    /// Records the file shown by a `cat`-like command (`cat`, `head`, `tail`,
    /// `less`, `bat`) as known and pre-caches its dependencies.
    ///
    /// The last whitespace-separated word of `cmd` is taken as the path, and
    /// it is tracked only when it has a file extension.
    fn track_file(&self, cmd: &str, output: &str) {
        let mut words = cmd.split_whitespace();
        let program = match words.next() {
            Some(word) => word,
            None => return,
        };
        // Strip any directory prefix, so `/usr/bin/cat` is treated like `cat`.
        let base = program.rsplit('/').next().unwrap_or(program);
        if !matches!(base, "cat" | "head" | "tail" | "less" | "bat") {
            return;
        }

        // With a single-word command the "last word" is the program itself.
        let path = words.next_back().unwrap_or(program);
        if Path::new(path).extension().is_some() {
            // Best effort: tracking failures must not block the output.
            let _ = self.engine.session_store().add_known_file(path);
            self.predictive_precache(path, output);
        }
    }

    /// Registers `content` as the content of `file_path` with the dependency
    /// mapper, then compresses and caches every dependency that exists on
    /// disk and is not cached yet.
    ///
    /// Relative dependency paths are resolved against the directory of
    /// `file_path`. Each newly cached dependency is also added to the L1 cache
    /// and to the session store's known files. Dependencies that cannot be
    /// read or compressed are skipped.
    fn predictive_precache(&self, file_path: &str, content: &str) {
        let path = Path::new(file_path);
        self.dep_mapper.borrow_mut().add_file(path, content);
        let deps = self.dep_mapper.borrow().dependencies_of(path);
        if deps.is_empty() {
            return;
        }

        let mut precached = 0;
        for dep_path in &deps {
            let resolved = match path.parent() {
                Some(parent) if !dep_path.is_absolute() => parent.join(dep_path),
                _ => dep_path.clone(),
            };
            // `is_file` is already false for paths that do not exist.
            if !resolved.is_file() {
                continue;
            }
            let dep_content = match std::fs::read_to_string(&resolved) {
                Ok(text) => text,
                Err(_) => continue,
            };
            // Already in the dedup cache: nothing to do.
            if let Ok(Some(_)) = self.engine.cache_manager().check_dedup(dep_content.as_bytes()) {
                continue;
            }
            if let Ok(compressed) = self.engine.compress(&dep_content) {
                let _ = self.engine.cache_manager().store_compressed(
                    dep_content.as_bytes(), &compressed,
                );
                let hash = content_hash(&dep_content);
                self.l1_cache.borrow_mut().insert(hash);
                let dep_str = resolved.to_string_lossy().to_string();
                let _ = self.engine.session_store().add_known_file(&dep_str);
                precached += 1;
            }
        }

        if precached > 0 {
            eprintln!("[sqz] predictive pre-cache: {} dependencies of {} cached",
                precached, file_path);
        }
    }

    /// Returns `true` when the program named by the first word of `cmd`
    /// appears in [`CLI_PATTERNS`].
    ///
    /// Any directory prefix is stripped first and the comparison ignores
    /// ASCII case, so `/usr/bin/git status` is recognised as `git`.
    // Only exercised by the tests in this file, hence the lint override.
    #[allow(dead_code)]
    pub fn is_known_command(cmd: &str) -> bool {
        let program = cmd.split_whitespace().next().unwrap_or("");
        let base = program.rsplit('/').next().unwrap_or("");
        CLI_PATTERNS
            .iter()
            .any(|pattern| base.eq_ignore_ascii_case(pattern))
    }

    /// Reads all of stdin, passes it through [`CliProxy::intercept_output`]
    /// and writes the result to stdout.
    ///
    /// The command label is taken from the `SQZ_CMD` environment variable and
    /// falls back to `"stdin"`. Input is read line by line, so line endings
    /// are normalised to `\n` and the buffered text always ends with a
    /// newline.
    ///
    /// # Errors
    ///
    /// Returns `SqzError::Other` when reading stdin, writing stdout or
    /// flushing stdout fails.
    pub fn run_proxy(&self) -> sqz_engine::Result<()> {
        use std::io::{self, BufRead, Write};

        let stdin = io::stdin();
        let mut buf = String::new();
        for line in stdin.lock().lines() {
            let line = line.map_err(|e| sqz_engine::SqzError::Other(e.to_string()))?;
            buf.push_str(&line);
            buf.push('\n');
        }

        let cmd = std::env::var("SQZ_CMD").unwrap_or_else(|_| "stdin".to_string());
        let compressed = self.intercept_output(&cmd, &buf);

        // Take the stdout lock only once there is something to write.
        let stdout = io::stdout();
        let mut out = stdout.lock();
        out.write_all(compressed.as_bytes())
            .map_err(|e| sqz_engine::SqzError::Other(e.to_string()))?;
        // Flush explicitly so a failure writing the buffered tail is reported
        // instead of being lost at process exit.
        out.flush()
            .map_err(|e| sqz_engine::SqzError::Other(e.to_string()))?;
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the engine-backed proxy that most tests below need.
    fn fresh_proxy() -> CliProxy {
        CliProxy::new().expect("engine init")
    }

    /// `git` is recognised bare, with a directory prefix, and with arguments.
    #[test]
    fn test_is_known_command_git() {
        for invocation in &["git", "/usr/bin/git", "git status"] {
            assert!(CliProxy::is_known_command(invocation));
        }
    }

    /// A program missing from the table is not recognised.
    #[test]
    fn test_is_known_command_unknown() {
        let recognised = CliProxy::is_known_command("my_custom_tool");
        assert!(!recognised);
    }

    /// The pattern table keeps at least 90 entries.
    #[test]
    fn test_patterns_count() {
        let count = CLI_PATTERNS.len();
        assert!(count >= 90, "expected ≥90 patterns, got {}", count);
    }

    /// Intercepting ordinary output yields a non-empty string.
    #[test]
    fn test_intercept_output_returns_string() {
        let proxy = fresh_proxy();
        let rendered = proxy.intercept_output("echo", "hello world\nsome output\n");
        assert!(!rendered.is_empty());
    }

    /// Empty output must be handled without panicking; the value is not checked.
    #[test]
    fn test_intercept_output_fallback_on_empty() {
        let proxy = fresh_proxy();
        let _ = proxy.intercept_output("git", "");
    }

    /// Sending the same output twice yields a dedup reference the second time.
    #[test]
    fn test_dedup_cache_returns_ref_on_second_call() {
        let proxy = fresh_proxy();
        let payload = "some repeated output that is long enough to be meaningful\n".repeat(5);
        let first = proxy.intercept_output("echo", &payload);
        let second = proxy.intercept_output("echo", &payload);
        assert!(second.starts_with("§ref:"), "expected dedup ref, got: {}", second);
        let first_was_ref = first.starts_with("§ref:");
        assert!(
            second.len() < first.len() || first_was_ref,
            "dedup ref should be shorter than original"
        );
    }

    /// `cat <file>` registers the file with the session store.
    #[test]
    fn test_file_tracking_on_cat() {
        let proxy = fresh_proxy();
        proxy.intercept_output("cat src/main.rs", "use std::io;\nfn main() {}\n");
        let known = proxy.engine.session_store().known_files().unwrap();
        let tracked = known.contains(&"src/main.rs".to_string());
        assert!(tracked, "cat should track the file path");
    }

    /// A diagnostic pointing at a known file gets the `[in context]` note.
    #[test]
    fn test_context_refs_annotate_known_files() {
        let proxy = fresh_proxy();
        let _ = proxy.engine.session_store().add_known_file("src/auth.rs");
        let diagnostic = "error[E0308]: mismatched types\n --> src/auth.rs:42:5\n";
        let annotated = proxy.apply_context_refs(diagnostic);
        assert!(
            annotated.contains("[in context]"),
            "should annotate known file: {}",
            annotated
        );
    }

    /// A diagnostic pointing at an unknown file is left without the note.
    #[test]
    fn test_context_refs_no_annotation_for_unknown_files() {
        let proxy = fresh_proxy();
        let diagnostic = "error[E0308]: mismatched types\n --> src/unknown.rs:42:5\n";
        let annotated = proxy.apply_context_refs(diagnostic);
        assert!(!annotated.contains("[in context]"), "should not annotate unknown file");
    }

    /// Directory names in `ls -la` output must come through unabbreviated.
    #[test]
    fn test_reddit_packages_not_abbreviated() {
        let proxy = fresh_proxy();
        let listing = "drwxr-xr-x 5 user user 4096 Apr 15 10:00 packages\n\
                       drwxr-xr-x 3 user user 4096 Apr 15 10:00 configuration\n\
                       drwxr-xr-x 2 user user 4096 Apr 15 10:00 documentation\n";
        let result = proxy.intercept_output("ls -la", listing);
        for name in &["packages", "configuration", "documentation"] {
            assert!(
                result.contains(name),
                "directory name '{name}' must not be abbreviated: {result}"
            );
        }
    }

    /// Path segments in `find` output must come through intact.
    #[test]
    fn test_paths_preserved_in_output() {
        let proxy = fresh_proxy();
        let found = "/etc/myapp/configuration/default.yml\n\
                     /usr/share/documentation/readme.md\n\
                     /home/user/.local/environment/config\n";
        let result = proxy.intercept_output("find /etc -name '*.yml'", found);
        for name in &["configuration", "documentation", "environment"] {
            assert!(
                result.contains(name),
                "path segment '{name}' must be preserved: {result}"
            );
        }
    }

    /// Remote URLs printed by `git remote -v` keep their repository segment.
    #[test]
    fn test_git_urls_preserved() {
        let proxy = fresh_proxy();
        let remotes = "origin\thttps://github.com/example/repository.git (fetch)\n\
                       origin\thttps://github.com/example/repository.git (push)\n";
        let result = proxy.intercept_output("git remote -v", remotes);
        let kept = result.contains("repository");
        assert!(kept, "URL segment 'repository' must be preserved: {result}");
    }

    /// Identifiers inside compiler diagnostics must come through intact.
    #[test]
    fn test_identifiers_preserved_in_code_output() {
        let proxy = fresh_proxy();
        let diagnostic = "error[E0433]: failed to resolve: use of undeclared crate or module `implementation`\n\
                          --> src/main.rs:5:5\n\
                          5 | use implementation::Config;\n";
        let result = proxy.intercept_output("cargo build", diagnostic);
        let kept = result.contains("implementation");
        assert!(kept, "identifier 'implementation' must be preserved: {result}");
    }

    /// Every file name of a long `ls -la` listing must appear in the result,
    /// and no run-length summary may replace them.
    #[test]
    fn test_ls_output_preserves_all_filenames_through_rle() {
        let proxy = fresh_proxy();
        let listing = "total 24\n\
                       drwxr-xr-x 6 user user 192 Apr 18 10:00 packages\n\
                       drwxr-xr-x 3 user user 96 Apr 18 10:00 configuration\n\
                       drwxr-xr-x 4 user user 128 Apr 18 10:00 documentation\n\
                       drwxr-xr-x 2 user user 64 Apr 18 10:00 environment\n\
                       -rw-r--r-- 1 user user 1024 Apr 18 10:00 README.md\n\
                       -rw-r--r-- 1 user user 512 Apr 18 10:00 Cargo.toml\n\
                       -rw-r--r-- 1 user user 256 Apr 18 10:00 LICENSE\n";
        let result = proxy.intercept_output("ls -la", listing);
        let expected = [
            "packages",
            "configuration",
            "documentation",
            "environment",
            "README.md",
            "Cargo.toml",
            "LICENSE",
        ];
        for name in &expected {
            assert!(
                result.contains(name),
                "filename '{name}' must appear in output — got:\n{result}"
            );
        }
        let summarised = result.contains("unique values");
        assert!(
            !summarised,
            "RLE pattern-run summary must not replace real filenames — got:\n{result}"
        );
    }
}