use tree_sitter::{Node, Parser};
/// One leaf of the syntax tree, plus the glue to emit after it when rendering.
struct Token {
    /// Byte offset in the original source where the leaf starts.
    byte_start: usize,
    /// Byte offset in the original source just past the end of the leaf.
    byte_end: usize,
    /// Row of the leaf's start position, as reported by the parser.
    line: usize,
    /// Exact source text of the leaf.
    text: String,
    /// Text written between this token and the next one; empty unless an
    /// annotation pass filled it in.
    separator: String,
}
/// Returns `true` for bytes that can be part of an identifier-like word:
/// ASCII letters, ASCII digits and `_`.
fn is_word_char(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
/// Returns `true` when `c` is one of the operator bytes
/// `= + - * / < > & | ^ % : ~`.
fn is_operator_char(c: u8) -> bool {
    const OPERATOR_BYTES: &[u8] = b"=+-*/<>&|^%:~";
    OPERATOR_BYTES.contains(&c)
}
/// Maps a file extension to the tree-sitter grammar compiled into this build.
///
/// Leading dots are stripped and the comparison is ASCII case-insensitive, so
/// `"rs"`, `".rs"` and `".RS"` are treated alike. Every grammar sits behind its
/// own `semantic-*` cargo feature; an extension whose feature is disabled, or
/// that is not listed at all, yields `None`.
pub fn language_for_ext(ext: &str) -> Option<tree_sitter::Language> {
    let normalized = ext.trim_start_matches('.').to_ascii_lowercase();
    match &*normalized {
        #[cfg(feature = "semantic-rust")]
        "rs" => Some(tree_sitter_rust::LANGUAGE.into()),
        #[cfg(feature = "semantic-java")]
        "java" => Some(tree_sitter_java::LANGUAGE.into()),
        #[cfg(feature = "semantic-go")]
        "go" => Some(tree_sitter_go::LANGUAGE.into()),
        #[cfg(feature = "semantic-bash")]
        "sh" | "bash" => Some(tree_sitter_bash::LANGUAGE.into()),
        #[cfg(feature = "semantic-clojure")]
        "clj" | "cljs" | "cljc" | "edn" | "bb" => Some(tree_sitter_clojure::LANGUAGE.into()),
        #[cfg(feature = "semantic-python")]
        "py" => Some(tree_sitter_python::LANGUAGE.into()),
        #[cfg(feature = "semantic-ruby")]
        "rb" | "rake" | "gemspec" => Some(tree_sitter_ruby::LANGUAGE.into()),
        #[cfg(feature = "semantic-elixir")]
        "ex" | "exs" => Some(tree_sitter_elixir::LANGUAGE.into()),
        #[cfg(feature = "semantic-c")]
        "c" | "h" => Some(tree_sitter_c::LANGUAGE.into()),
        #[cfg(feature = "semantic-cpp")]
        "cpp" | "cc" | "cxx" | "hpp" | "hh" => Some(tree_sitter_cpp::LANGUAGE.into()),
        // TypeScript and TSX are two grammars shipped behind one feature.
        #[cfg(feature = "semantic-ts")]
        "ts" => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
        #[cfg(feature = "semantic-ts")]
        "tsx" => Some(tree_sitter_typescript::LANGUAGE_TSX.into()),
        _ => None,
    }
}
/// Minifies `source` using the grammar selected by `ext`.
///
/// Returns only the minified text; the span table built alongside it is
/// discarded. Yields `None` whenever `minify_with_spans` does.
pub fn minify(ext: &str, source: &str) -> Option<String> {
    let (minified, _spans) = minify_with_spans(ext, source)?;
    Some(minified)
}
/// Minifies `source` and reports where every emitted token came from.
///
/// The pipeline is: parse, collect leaf tokens, annotate separators for the
/// language, render, then parse the rendered text again as a sanity check.
///
/// Returns `None` when there is no grammar for `ext`, the grammar cannot be
/// loaded, the source does not parse cleanly, the rendered text is empty, or
/// the rendered text no longer parses cleanly.
fn minify_with_spans(ext: &str, source: &str) -> Option<(String, Vec<Span>)> {
    let language = language_for_ext(ext)?;
    let mut parser = Parser::new();
    parser.set_language(&language).ok()?;

    let tree = parser.parse(source, None)?;
    if tree.root_node().has_error() {
        // Never minify something the grammar could not fully understand.
        return None;
    }

    let bytes = source.as_bytes();
    let mut leaves: Vec<Token> = Vec::new();
    collect_leaves(tree.root_node(), bytes, &mut leaves);
    annotate(ext, &mut leaves, bytes);

    let (minified, spans) = render_with_spans(&leaves);
    if minified.is_empty() {
        return None;
    }

    // Re-validate: the collapsed text must still be accepted by the grammar.
    let reparsed = parser.parse(&minified, None)?;
    let still_clean = !reparsed.root_node().has_error();
    still_clean.then_some((minified, spans))
}
/// Appends every non-empty leaf under `node` to `tokens`, in document order.
///
/// Whitespace nodes and comment nodes are skipped together with everything
/// beneath them. Leaves whose text is not valid UTF-8 are silently dropped.
fn collect_leaves(node: Node, src: &[u8], tokens: &mut Vec<Token>) {
    // Node kinds that never contribute to the output.
    const SKIPPED_KINDS: &[&str] = &[
        "\n",
        "\t",
        " ",
        "comment",
        "line_comment",
        "block_comment",
        "multiline_comment",
    ];
    if SKIPPED_KINDS.contains(&node.kind()) {
        return;
    }

    if node.child_count() > 0 {
        // Interior node: descend into each child.
        let mut cursor = node.walk();
        for child in node.children(&mut cursor) {
            collect_leaves(child, src, tokens);
        }
        return;
    }

    match node.utf8_text(src) {
        Ok(text) if !text.is_empty() => tokens.push(Token {
            text: text.to_string(),
            line: node.start_position().row,
            byte_start: node.start_byte(),
            byte_end: node.end_byte(),
            separator: String::new(),
        }),
        _ => {}
    }
}
/// Runs the separator-annotation pass that matches `ext`.
///
/// The extension is normalised by stripping leading dots and lowercasing
/// ASCII letters. Go gets semicolon re-insertion, the Clojure family gets a
/// single space wherever the source had a gap, and the listed
/// whitespace-sensitive languages keep a collapsed copy of the original gap.
/// Any other extension leaves the tokens untouched.
fn annotate(ext: &str, tokens: &mut [Token], source: &[u8]) {
    const CLOJURE_EXTS: &[&str] = &["clj", "cljs", "cljc", "edn", "bb"];
    const GAP_PRESERVING_EXTS: &[&str] = &[
        "sh", "bash", "py", "rb", "rake", "gemspec", "ts", "tsx", "c", "h", "cpp", "cc", "cxx",
        "hpp", "hh", "ex", "exs",
    ];

    let normalized = ext.trim_start_matches('.').to_ascii_lowercase();
    let key = normalized.as_str();

    if key == "go" {
        annotate_go(tokens);
    } else if CLOJURE_EXTS.contains(&key) {
        annotate_clojure(tokens);
    } else if GAP_PRESERVING_EXTS.contains(&key) {
        annotate_gap_preserve(tokens, source);
    }
}
/// Keeps a trimmed copy of the original inter-token gap as the separator.
///
/// For each pair of neighbouring tokens the bytes between them are scanned.
/// Only spaces, tabs and newlines are kept, and a newline that directly
/// follows another kept newline is dropped. Gaps that are empty, inverted or
/// reach past the end of `source` are ignored, as are gaps containing none of
/// those three bytes.
fn annotate_gap_preserve(tokens: &mut [Token], source: &[u8]) {
    for next in 1..tokens.len() {
        let gap_range = tokens[next - 1].byte_end..tokens[next].byte_start;
        // `get` rejects inverted and out-of-range gaps; an empty gap simply
        // produces no whitespace below.
        let Some(gap) = source.get(gap_range) else {
            continue;
        };

        let mut kept = String::new();
        for &byte in gap {
            match byte {
                // Collapse runs of newlines (ignoring any non-whitespace
                // bytes that sat between them).
                b'\n' if kept.ends_with('\n') => {}
                b' ' | b'\t' | b'\n' => kept.push(char::from(byte)),
                _ => {}
            }
        }

        if !kept.is_empty() {
            tokens[next - 1].separator = kept;
        }
    }
}
/// Gives a token a single-space separator whenever the source had any gap
/// between it and the token that follows. Tokens that touched stay touching.
fn annotate_clojure(tokens: &mut [Token]) {
    for next in 1..tokens.len() {
        let had_gap = tokens[next - 1].byte_end < tokens[next].byte_start;
        if had_gap {
            tokens[next - 1].separator = String::from(" ");
        }
    }
}
/// Re-inserts the semicolons that Go would add at line ends.
///
/// A token gets `";"` as its separator when the next token starts on a later
/// line, the token itself is one that triggers semicolon insertion, and the
/// next token is not a closing delimiter or comma.
fn annotate_go(tokens: &mut [Token]) {
    for idx in 1..tokens.len() {
        let (head, tail) = tokens.split_at_mut(idx);
        let current = &mut head[idx - 1];
        let following = &tail[0];

        let line_break_follows = following.line > current.line;
        if line_break_follows
            && go_semicolon_trigger(&current.text)
            && !is_closing_token(&following.text)
        {
            current.separator = ";".to_string();
        }
    }
}
/// Decides whether a line ending right after `text` should get a semicolon.
///
/// True when the token ends in a word character, a closing bracket, a quote
/// or backtick, or when the token is exactly `++` or `--`. Empty text and
/// every other token yield `false`.
fn go_semicolon_trigger(text: &str) -> bool {
    match text.as_bytes().last().copied() {
        None => false,
        Some(b')' | b']' | b'}' | b'"' | b'\'' | b'`') => true,
        // A lone `+`/`-` (or e.g. `+=`) continues the expression.
        Some(b'+') => text == "++",
        Some(b'-') => text == "--",
        Some(tail) => is_word_char(tail),
    }
}
/// Returns `true` when `text` is exactly `}`, `)`, `]` or `,`.
fn is_closing_token(text: &str) -> bool {
    const CLOSERS: [&str; 4] = ["}", ")", "]", ","];
    CLOSERS.contains(&text)
}
/// Links one rendered token to the source bytes it was copied from.
struct Span {
    /// Byte offset in the original source where the token starts.
    orig_start: usize,
    /// Byte offset in the original source just past the end of the token.
    orig_end: usize,
    /// Byte offset in the minified text where the token starts.
    min_start: usize,
    /// Byte offset in the minified text just past the end of the token.
    min_end: usize,
}
/// Concatenates `tokens` into minified text and records, for every token, the
/// byte range it occupies in the output next to its range in the source.
///
/// Between two tokens the previous token's `separator` is written first. A
/// single padding space is then added only when the source had a gap between
/// the two tokens and gluing them would either merge two word characters or
/// put a `.` directly after an operator character.
///
/// Tokens that were contiguous in the source are never padded. Leaves such as
/// string content and escape sequences sit back to back inside a literal, and
/// a space inserted between them would change the literal's value while still
/// parsing cleanly.
///
/// Returns the rendered text and one [`Span`] per token, in token order.
fn render_with_spans(tokens: &[Token]) -> (String, Vec<Span>) {
    let mut out = String::new();
    let mut spans = Vec::with_capacity(tokens.len());
    let mut previous: Option<&Token> = None;

    for tok in tokens {
        if let Some(prev) = previous {
            out.push_str(&prev.separator);

            // Only a real gap in the source can require a separator here.
            if prev.byte_end < tok.byte_start {
                let last = out.as_bytes().last().copied();
                // `first()` instead of `[0]`: an empty token must not panic.
                let first = tok.text.as_bytes().first().copied();
                if let (Some(last), Some(first)) = (last, first) {
                    let merges_words = is_word_char(last) && is_word_char(first);
                    let dot_after_operator = is_operator_char(last) && first == b'.';
                    if merges_words || dot_after_operator {
                        out.push(' ');
                    }
                }
            }
        }

        let min_start = out.len();
        out.push_str(&tok.text);
        spans.push(Span {
            min_start,
            min_end: out.len(),
            orig_start: tok.byte_start,
            orig_end: tok.byte_end,
        });
        previous = Some(tok);
    }

    (out, spans)
}
/// Reasons [`apply_minified_edit`] can refuse to perform an edit.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so it can be
/// propagated with `?` into boxed or dynamic error types.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MinifiedEditError {
    /// The source could not be minified at all.
    Unsupported,
    /// The snippet is empty or does not occur in the minified text.
    NotFound,
    /// The snippet occurs more than once in the minified text.
    NotUnique,
    /// The snippet does not start and end on token boundaries, or the mapped
    /// source range is not usable.
    NotAligned,
}

impl std::fmt::Display for MinifiedEditError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::Unsupported => "source cannot be minified",
            Self::NotFound => "minified snippet not found",
            Self::NotUnique => "minified snippet matches more than once",
            Self::NotAligned => "minified snippet does not align with token boundaries",
        };
        f.write_str(message)
    }
}

impl std::error::Error for MinifiedEditError {}
/// Applies an edit expressed against the minified text to the original source.
///
/// `old_minified` is located in the minified rendering of `source`, mapped
/// back to the source bytes covered by the tokens it spans, and that source
/// range is replaced with `new_text`. Everything outside the range, including
/// original formatting, is kept byte for byte.
///
/// # Errors
///
/// * [`MinifiedEditError::NotFound`] - `old_minified` is empty or does not
///   occur in the minified text.
/// * [`MinifiedEditError::Unsupported`] - `source` cannot be minified for `ext`.
/// * [`MinifiedEditError::NotUnique`] - `old_minified` occurs more than once.
///   Overlapping occurrences count too (`"))"` inside `")))"`), because the
///   caller's intent is ambiguous in that case as well.
/// * [`MinifiedEditError::NotAligned`] - the match does not begin at the start
///   of a token and end at the end of a token, or the mapped source range is
///   inverted, out of bounds or not on character boundaries.
pub fn apply_minified_edit(
    ext: &str,
    source: &str,
    old_minified: &str,
    new_text: &str,
) -> Result<String, MinifiedEditError> {
    if old_minified.is_empty() {
        return Err(MinifiedEditError::NotFound);
    }
    let (minified, spans) = minify_with_spans(ext, source).ok_or(MinifiedEditError::Unsupported)?;

    let m_start = minified
        .find(old_minified)
        .ok_or(MinifiedEditError::NotFound)?;
    // Resume the search one character past the first hit rather than past the
    // whole match, so an overlapping second occurrence is detected as well.
    let first_char_len = old_minified.chars().next().map_or(1, char::len_utf8);
    if minified[m_start + first_char_len..].contains(old_minified) {
        return Err(MinifiedEditError::NotUnique);
    }
    let m_end = m_start + old_minified.len();

    // The match must begin exactly where some token begins...
    let orig_start = spans
        .iter()
        .find(|s| s.min_start == m_start)
        .map(|s| s.orig_start)
        .ok_or(MinifiedEditError::NotAligned)?;
    // ...and stop exactly where some token stops.
    let orig_end = spans
        .iter()
        .find(|s| s.min_end == m_end)
        .map(|s| s.orig_end)
        .ok_or(MinifiedEditError::NotAligned)?;

    // Guard the slicing below against an unusable range.
    if orig_start > orig_end
        || orig_end > source.len()
        || !source.is_char_boundary(orig_start)
        || !source.is_char_boundary(orig_end)
    {
        return Err(MinifiedEditError::NotAligned);
    }

    let replaced_len = orig_end - orig_start;
    let mut result = String::with_capacity(source.len() - replaced_len + new_text.len());
    result.push_str(&source[..orig_start]);
    result.push_str(new_text);
    result.push_str(&source[orig_end..]);
    Ok(result)
}
/// Unit tests for the minifier and the minified-edit mapping.
///
/// Grammar-specific tests are compiled only when the matching `semantic-*`
/// feature is enabled.
#[cfg(test)]
mod tests {
    use super::*;

    /// Extensions without a grammar yield no language and no minified output.
    #[test]
    fn unsupported_language_returns_none() {
        assert!(language_for_ext("md").is_none());
        assert!(language_for_ext("json").is_none());
        assert!(minify("md", "# hi\n\nsome text").is_none());
    }

    /// Common non-source extensions must never resolve to a grammar.
    #[test]
    fn non_source_extensions_are_gated_out() {
        for ext in ["md", "json", "txt", "toml", "yaml", "yml", "lock", "png"] {
            assert!(
                language_for_ext(ext).is_none(),
                "{ext} has no grammar; must fall back"
            );
        }
    }

    /// Rust: comments go away, word boundaries survive, output still parses.
    #[cfg(feature = "semantic-rust")]
    #[test]
    fn rust_minify_strips_comments_and_collapses_safely() {
        let src =
            "// a comment\nfn add ( a : i32 , b : i32 ) -> i32 {\n // inner\n a + b\n}\n";
        let out = minify("rs", src).expect("rust minifies");
        assert!(!out.contains("comment"), "comments stripped: {out}");
        assert!(!out.contains("inner"));
        assert!(out.contains("fn add"), "word boundary kept: {out}");
        // Two consecutive spaces: a single space is legitimately present in
        // "fn add", so only a doubled space indicates a collapse failure.
        assert!(!out.contains("  "), "no double spaces: {out}");
        assert!(minify("rs", &out).is_some(), "minified output still parses");
    }

    /// Java: spaces inside string literals are kept, comments are dropped.
    #[cfg(feature = "semantic-java")]
    #[test]
    fn java_minify_preserves_string_literals_and_boundaries() {
        let src = "// header\nclass A {\n String s = \"hello world\"; // keep spaces in string\n int x = 1 ;\n}\n";
        let out = minify("java", src).expect("java minifies");
        assert!(!out.contains("header"), "comment stripped: {out}");
        assert!(
            !out.contains("keep spaces"),
            "trailing comment stripped: {out}"
        );
        assert!(
            out.contains("\"hello world\""),
            "string literal intact: {out}"
        );
        assert!(out.contains("class A"), "word boundary kept: {out}");
    }

    /// Go: line-end semicolons are written out explicitly.
    #[cfg(feature = "semantic-go")]
    #[test]
    fn go_minify_reinserts_auto_semicolons() {
        let src = "package main\nfunc main() {\n\tx := 1\n\ty := 2\n\t_ = x + y\n}\n";
        let out = minify("go", src).expect("go minifies via annotate_go");
        assert!(out.contains("x:=1;"), "auto-semicolon re-inserted: {out}");
        assert!(!out.contains("comment"));
        assert!(minify("go", &out).is_some(), "minified Go re-parses: {out}");
    }

    /// Go: comments, nested blocks and string literals round-trip.
    #[cfg(feature = "semantic-go")]
    #[test]
    fn go_minify_round_trips_with_comments_and_blocks() {
        let src = "package main\n// doc\nimport \"fmt\"\nfunc greet(n string) string {\n\tif n == \"\" {\n\t\treturn \"hi\"\n\t}\n\treturn fmt.Sprintf(\"hi %s\", n)\n}\n";
        let out = minify("go", src).expect("go minifies");
        assert!(!out.contains("doc"), "comment stripped: {out}");
        assert!(out.contains("func greet"), "word boundary kept: {out}");
        assert!(out.contains("\"hi %s\""), "string literal intact: {out}");
        assert!(minify("go", &out).is_some(), "re-parses: {out}");
    }

    /// Token classes that do and do not trigger semicolon insertion.
    #[test]
    fn go_semicolon_trigger_matches_spec() {
        assert!(go_semicolon_trigger("x"));
        assert!(go_semicolon_trigger("123"));
        assert!(go_semicolon_trigger(")"));
        assert!(go_semicolon_trigger("}"));
        assert!(go_semicolon_trigger("++"));
        assert!(go_semicolon_trigger("--"));
        assert!(!go_semicolon_trigger("+"));
        assert!(!go_semicolon_trigger("{"));
        assert!(!go_semicolon_trigger(""));
    }

    /// Clojure: symbols separated in the source must stay separated.
    #[cfg(feature = "semantic-clojure")]
    #[test]
    fn clojure_keeps_operator_symbol_boundaries() {
        let src = "; a comment\n(defn f [x]\n (-> x\n inc\n (+ 2)))\n";
        let out = minify("clj", src).expect("clojure minifies");
        assert!(!out.contains("comment"), "comment stripped: {out}");
        assert!(out.contains("-> x"), "operator/atom boundary kept: {out}");
        assert!(!out.contains("->x"), "must not merge symbols: {out}");
        assert!(out.contains("defn f"), "atoms stay separated: {out}");
        assert!(
            minify("clj", &out).is_some(),
            "minified clojure re-parses: {out}"
        );
    }

    /// Clojure: no space is introduced next to delimiters.
    #[cfg(feature = "semantic-clojure")]
    #[test]
    fn clojure_no_space_around_delimiters() {
        let src = "(list 1 2 3)\n";
        let out = minify("clj", src).expect("minifies");
        assert!(
            out.contains("(list 1 2 3)"),
            "delimiter adjacency preserved: {out}"
        );
    }

    /// Bash: commands on separate lines do not get fused.
    #[cfg(feature = "semantic-bash")]
    #[test]
    fn bash_preserves_command_newlines() {
        let src = "# comment\necho a\n\n\necho b\n";
        let out = minify("sh", src).expect("bash minifies");
        assert!(!out.contains("comment"), "comment stripped: {out}");
        assert!(out.contains("echo a"), "{out}");
        assert!(out.contains("echo b"), "{out}");
        assert!(!out.contains("aecho"), "commands stay separated: {out:?}");
        assert!(minify("sh", &out).is_some(), "re-parses: {out}");
    }

    /// TypeScript: newlines that statement termination relies on are kept.
    #[cfg(feature = "semantic-ts")]
    #[test]
    fn typescript_gap_preserve_keeps_asi_newlines() {
        let src = "// c\nconst a = 1\nconst b = 2\nconsole.log(a + b)\n";
        let out = minify("ts", src).expect("ts minifies (gap-preserve)");
        assert!(!out.contains("// c"), "comment stripped: {out}");
        assert!(out.contains('\n'), "newlines preserved for ASI: {out:?}");
        assert!(minify("ts", &out).is_some(), "re-parses: {out}");
    }

    /// C: preprocessor lines keep their spacing and trailing newline.
    #[cfg(feature = "semantic-c")]
    #[test]
    fn c_gap_preserve_keeps_preprocessor_lines() {
        let src = "#include <stdio.h>\nint main(void) {\n return 0; // ok\n}\n";
        let out = minify("c", src).expect("c minifies (gap-preserve)");
        assert!(
            out.contains("#include <stdio.h>"),
            "preproc intact: {out:?}"
        );
        assert!(out.contains('\n'), "preproc newline kept: {out:?}");
        assert!(!out.contains("// ok"), "comment stripped: {out}");
        assert!(minify("c", &out).is_some(), "re-parses: {out}");
    }

    /// Python: block indentation survives minification.
    #[cfg(feature = "semantic-python")]
    #[test]
    fn python_gap_preserve_keeps_indentation() {
        // The fixture needs real nested indentation: with a flat indent the
        // `if` would have no body and the source would not parse.
        let src = "# doc\ndef f(x):\n    if x:\n        return 1\n    return 0\n";
        let out = minify("py", src).expect("python minifies (gap-preserve)");
        assert!(!out.contains("# doc"), "comment stripped: {out}");
        assert!(
            out.contains("        return 1"),
            "indentation preserved: {out:?}"
        );
        assert!(minify("py", &out).is_some(), "re-parses: {out}");
    }

    /// Input with syntax errors is rejected instead of being minified.
    #[cfg(feature = "semantic-rust")]
    #[test]
    fn syntactically_broken_input_is_not_minified() {
        assert!(minify("rs", "fn broken( {{{ ").is_none());
    }

    /// An edit phrased against minified text lands in the original source and
    /// leaves the untouched formatting alone.
    #[cfg(feature = "semantic-rust")]
    #[test]
    fn minified_edit_maps_back_to_original_preserving_formatting() {
        let src = "fn main() {\n let x = 1;\n let y = 2;\n}\n";
        assert_eq!(minify("rs", src).unwrap(), "fn main(){let x=1;let y=2;}");
        let out = apply_minified_edit("rs", src, "let x=1", "let x = 42").unwrap();
        assert_eq!(out, "fn main() {\n let x = 42;\n let y = 2;\n}\n");
        assert!(minify("rs", &out).is_some());
    }

    /// Each rejection reason is reported with its own error variant.
    #[cfg(feature = "semantic-rust")]
    #[test]
    fn minified_edit_rejects_not_found_not_unique_and_misaligned() {
        let src = "fn main() {\n let x = 1;\n let y = 2;\n}\n";
        assert_eq!(
            apply_minified_edit("rs", src, "nonexistent", "x"),
            Err(MinifiedEditError::NotFound)
        );
        assert_eq!(
            apply_minified_edit("rs", src, "let ", "x"),
            Err(MinifiedEditError::NotUnique)
        );
        assert_eq!(
            apply_minified_edit("rs", src, "ain(", "x"),
            Err(MinifiedEditError::NotAligned)
        );
        assert_eq!(
            apply_minified_edit("rs", src, "", "x"),
            Err(MinifiedEditError::NotFound)
        );
    }

    /// Editing a file type without a grammar reports `Unsupported`.
    #[test]
    fn minified_edit_unsupported_language() {
        assert_eq!(
            apply_minified_edit("md", "# hi\n", "hi", "bye"),
            Err(MinifiedEditError::Unsupported)
        );
    }

    /// Minifies files from this repository, checks the output shrinks and
    /// re-parses, and prints the savings to stderr.
    ///
    /// Fails when no grammar feature contributes any case.
    #[test]
    fn minifies_real_repo_files() {
        let root = env!("CARGO_MANIFEST_DIR");
        let mut cases: Vec<(&str, &str)> = Vec::new();
        #[cfg(feature = "semantic-rust")]
        {
            cases.push(("src/agent/agent_loop/run.rs", "rs"));
            cases.push(("src/agent/tools/cache.rs", "rs"));
            cases.push(("src/semantic/minify.rs", "rs"));
        }
        assert!(!cases.is_empty(), "no collapse-safe grammar compiled in");
        let mut total_in = 0usize;
        let mut total_out = 0usize;
        for (rel, ext) in cases {
            let path = std::path::Path::new(root).join(rel);
            let src = std::fs::read_to_string(&path).unwrap_or_else(|e| panic!("read {rel}: {e}"));
            let min = minify(ext, &src)
                .unwrap_or_else(|| panic!("{rel} should minify (clean parse + revalidate)"));
            assert!(
                min.len() < src.len(),
                "{rel}: minified ({}) not smaller than source ({})",
                min.len(),
                src.len()
            );
            assert!(
                minify(ext, &min).is_some(),
                "{rel}: minified output must re-parse"
            );
            let pct = 100.0 * (1.0 - min.len() as f64 / src.len() as f64);
            eprintln!(
                "minify {rel:50} {:>7} -> {:>7} bytes ({pct:4.1}% saved)",
                src.len(),
                min.len()
            );
            total_in += src.len();
            total_out += min.len();
        }
        // `max(1)` keeps the ratio defined even if every file were empty.
        let pct = 100.0 * (1.0 - total_out as f64 / total_in.max(1) as f64);
        eprintln!(
            "minify {:50} {:>7} -> {:>7} bytes ({pct:4.1}% saved)",
            "TOTAL", total_in, total_out
        );
    }
}