mod locale;
mod locales;
pub use locale::Locale;
use std::path::{Path, PathBuf};
use crate::session::home_dir;
/// Compression aggressiveness requested by the caller.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Mode {
/// Standard prose compression; this is the mode `run` starts in.
Full,
/// Standard compression plus the locale's abbreviation substitutions
/// (selected by `--ultra`).
Ultra,
}
/// Before/after measurements used for reporting and for the integrity check.
///
/// `orig_*` fields describe the input text, `new_*` fields the compressed output.
#[derive(Debug, Clone, Default)]
pub struct Stats {
// Length in bytes of the input and of the compressed output.
pub orig_bytes: usize,
pub new_bytes: usize,
// Fenced code blocks, counted as pairs of lines starting with ```.
pub orig_code_blocks: usize,
pub new_code_blocks: usize,
// Occurrences of `http://` / `https://` URLs.
pub orig_urls: usize,
pub new_urls: usize,
// Lines whose first non-whitespace character is `#`.
pub orig_headings: usize,
pub new_headings: usize,
}
/// Outcome of compressing one markdown document.
#[derive(Debug, Clone)]
pub struct CompressResult {
// The compressed text.
pub output: String,
// Measurements of the input and of `output`.
pub stats: Stats,
// True when the integrity check passed: code-block and heading counts are
// unchanged, no URL was lost, and the output is at least one fifth of the
// input size.
pub safe: bool,
}
/// Entry point of the `compress-md` subcommand.
///
/// Parses `args` (flags plus file paths), resolves the locale from `--lang`
/// or, when absent, from the loaded configuration, and compresses every
/// target that exists on disk.
///
/// Returns `0` on success (including `--help`), `2` on a usage error
/// (unknown flag, missing `--lang` value, no targets), and `1` when at least
/// one file failed to process. Paths that do not exist are reported (unless
/// `--all` or `--quiet` is set) but do not affect the exit code.
pub fn run(args: &[String]) -> i32 {
    let mut mode = Mode::Full;
    let mut dry_run = false;
    let mut all = false;
    let mut quiet = false;
    let mut lang_cli: Option<String> = None;
    let mut targets: Vec<String> = Vec::new();

    let mut remaining = args.iter();
    while let Some(arg) = remaining.next() {
        match arg.as_str() {
            "--ultra" => mode = Mode::Ultra,
            "--dry-run" => dry_run = true,
            "--all" => all = true,
            "--quiet" => quiet = true,
            // `--lang` consumes the following argument as its value.
            "--lang" => match remaining.next() {
                Some(value) => lang_cli = Some(value.clone()),
                None => {
                    eprintln!("squeez compress-md: --lang requires a value");
                    return 2;
                }
            },
            "-h" | "--help" => {
                print_help();
                return 0;
            }
            flag if flag.starts_with("--") => {
                eprintln!("squeez compress-md: unknown flag {}", flag);
                return 2;
            }
            path => targets.push(path.to_string()),
        }
    }

    // The command-line value wins; the configuration is only loaded as a fallback.
    let code = match lang_cli {
        Some(code) => code,
        None => crate::config::Config::load().lang,
    };
    let locale = Locale::from_code(&code);

    if !all && targets.is_empty() {
        eprintln!("squeez compress-md: no files given (use --all or pass paths)");
        return 2;
    }
    let files: Vec<PathBuf> = if all {
        all_targets()
    } else {
        targets.iter().map(PathBuf::from).collect()
    };

    let mut had_error = false;
    let mut any_processed = false;
    for file in &files {
        if file.exists() {
            any_processed = true;
            if let Err(e) = process_file(file, mode, dry_run, quiet, locale) {
                eprintln!("squeez compress-md: {} — {}", file.display(), e);
                had_error = true;
            }
        } else if !all && !quiet {
            // With --all, missing well-known files are expected and stay silent.
            eprintln!("squeez compress-md: not found: {}", file.display());
        }
    }

    if all && !any_processed && !quiet {
        eprintln!("squeez compress-md: no markdown files found in known locations");
    }
    i32::from(had_error)
}
/// Compresses every existing well-known markdown file in `Ultra` mode without
/// printing informational output.
///
/// The locale comes from the loaded configuration. Per-file errors are
/// deliberately ignored and the function always returns `0`.
pub fn run_all_quietly() -> i32 {
    let cfg = crate::config::Config::load();
    let locale = Locale::from_code(&cfg.lang);
    for path in all_targets().iter().filter(|candidate| candidate.exists()) {
        // Best effort: a failure on one file must not stop the others.
        let _ = process_file(path, Mode::Ultra, false, true, locale);
    }
    0
}
/// Prints the usage text of `squeez compress-md` to stdout, one line at a time.
fn print_help() {
    const HELP_LINES: &[&str] = &[
        "squeez compress-md — pure-Rust markdown prose compressor",
        "",
        "Usage:",
        " squeez compress-md [--ultra] [--dry-run] <file>...",
        " squeez compress-md [--ultra] [--dry-run] --all",
        "",
        "Flags:",
        " --ultra Aggressive abbreviations (with→w/, function→fn, ...)",
        " --dry-run Print compressed text to stdout, do not write",
        " --all Walk known locations: ~/.claude/CLAUDE.md,",
        " ~/.copilot/copilot-instructions.md,",
        " $PWD/CLAUDE.md, $PWD/AGENTS.md,",
        " $PWD/.github/copilot-instructions.md",
        " --quiet Suppress informational output",
        " --lang <code> Locale: en (default), pt-BR. Overrides config 'lang'.",
        "",
        "Preserved verbatim: code blocks (```...```), inline `code`,",
        "URLs, file paths, headings, tables, list markers, version numbers.",
        "",
        "Backups are written to <stem>.original.md and never overwritten.",
    ];
    for line in HELP_LINES {
        println!("{}", line);
    }
}
/// Lists the well-known markdown files considered by `--all`.
///
/// Two files under the home directory always come first; three files under
/// the current working directory follow when that directory can be
/// determined. The paths are not checked for existence here.
fn all_targets() -> Vec<PathBuf> {
    let home = home_dir();
    let mut paths: Vec<PathBuf> = vec![
        format!("{}/.claude/CLAUDE.md", home),
        format!("{}/.copilot/copilot-instructions.md", home),
    ]
    .into_iter()
    .map(PathBuf::from)
    .collect();

    if let Ok(cwd) = std::env::current_dir() {
        let project_files = ["CLAUDE.md", "AGENTS.md", ".github/copilot-instructions.md"];
        paths.extend(project_files.iter().map(|name| cwd.join(name)));
    }
    paths
}
fn process_file(
path: &Path,
mode: Mode,
dry_run: bool,
quiet: bool,
locale: &'static Locale,
) -> Result<(), String> {
let original = std::fs::read_to_string(path).map_err(|e| e.to_string())?;
let result = compress_text_with_locale(&original, mode, locale);
if !result.safe {
return Err(format!(
"integrity check failed (code_blocks {}→{}, urls {}→{}, headings {}→{}, bytes {}→{})",
result.stats.orig_code_blocks,
result.stats.new_code_blocks,
result.stats.orig_urls,
result.stats.new_urls,
result.stats.orig_headings,
result.stats.new_headings,
result.stats.orig_bytes,
result.stats.new_bytes,
));
}
let pct = if result.stats.orig_bytes > 0 {
100usize.saturating_sub(result.stats.new_bytes * 100 / result.stats.orig_bytes)
} else {
0
};
if dry_run {
print!("{}", result.output);
if !quiet {
eprintln!(
"# squeez compress-md (dry-run) {} {}→{} bytes (-{}%)",
path.display(),
result.stats.orig_bytes,
result.stats.new_bytes,
pct
);
}
return Ok(());
}
if result.stats.new_bytes >= result.stats.orig_bytes {
if !quiet {
eprintln!(
"squeez compress-md: {} already compressed (no further reduction)",
path.display()
);
}
return Ok(());
}
let backup = backup_path(path);
if !backup.exists() {
std::fs::write(&backup, &original).map_err(|e| e.to_string())?;
}
std::fs::write(path, &result.output).map_err(|e| e.to_string())?;
if !quiet {
eprintln!(
"squeez compress-md: {} {}→{} bytes (-{}%)",
path.display(),
result.stats.orig_bytes,
result.stats.new_bytes,
pct
);
}
Ok(())
}
/// Returns the backup location for `p`: `<stem>.original.md` in the same
/// directory.
///
/// Falls back to the stem `file` when the path has no UTF-8 file stem and to
/// the directory `.` when the path has no parent.
fn backup_path(p: &Path) -> PathBuf {
    let stem = match p.file_stem().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "file",
    };
    let backup_name = format!("{}.original.md", stem);
    match p.parent() {
        Some(dir) => dir.join(backup_name),
        None => Path::new(".").join(backup_name),
    }
}
/// Line-scanner state used while walking a markdown document.
#[derive(Eq, PartialEq)]
enum State {
// Ordinary markdown; each line is classified on its own.
Text,
// Inside a ``` fence; lines are copied verbatim until the next fence line.
FencedCode,
// Inside a run of table rows; rows are copied verbatim.
Table,
}
/// Compresses `input` using the locale registered under the code `"en"`.
///
/// Convenience wrapper around [`compress_text_with_locale`].
pub fn compress_text(input: &str, mode: Mode) -> CompressResult {
    let english = Locale::from_code("en");
    compress_text_with_locale(input, mode, english)
}
/// Compresses the prose of a markdown document while leaving structure intact.
///
/// The document is walked line by line:
/// - lines between ``` fences (fences included) are copied verbatim;
/// - runs of table rows are copied verbatim;
/// - blank lines, headings, HTML comments, block quotes and rule/underline
///   lines are copied verbatim;
/// - every other line is compressed as prose using `mode` and `locale`.
///
/// Runs of blank lines are then collapsed, and the result is measured against
/// the input. `safe` is true when the code-block and heading counts are
/// unchanged, no URL was lost, and the output is at least one fifth of the
/// input size.
pub fn compress_text_with_locale(
    input: &str,
    mode: Mode,
    locale: &'static Locale,
) -> CompressResult {
    let mut stats = Stats {
        orig_bytes: input.len(),
        orig_code_blocks: count_code_blocks(input),
        orig_urls: count_urls(input),
        orig_headings: count_headings(input),
        ..Default::default()
    };

    let mut out = String::with_capacity(input.len());
    let mut state = State::Text;

    // A final '\n' terminates the last line; it does not start another one.
    // Splitting the raw input would yield a phantom empty segment that gets
    // echoed as an extra blank line at the end of the output.
    let terminated = input.ends_with('\n');
    let body = input.strip_suffix('\n').unwrap_or(input);

    for line in body.split('\n') {
        let is_fence = line.trim_start().starts_with("```");
        match state {
            State::FencedCode => {
                out.push_str(line);
                if is_fence {
                    state = State::Text;
                }
            }
            State::Table if is_table_row(line) => out.push_str(line),
            // A table ends at the first non-row line, which is then classified
            // exactly like any line met in the Text state.
            State::Table | State::Text => {
                if is_fence {
                    out.push_str(line);
                    state = State::FencedCode;
                } else if is_table_row(line) {
                    out.push_str(line);
                    state = State::Table;
                } else {
                    state = State::Text;
                    if is_protected_line(line) {
                        out.push_str(line);
                    } else {
                        out.push_str(&compress_prose_line(line, mode, locale));
                    }
                }
            }
        }
        out.push('\n');
    }
    if !terminated {
        // The input's last line had no newline, so the output's must not either.
        out.pop();
    }

    let collapsed = collapse_blank_runs(&out);
    stats.new_bytes = collapsed.len();
    stats.new_code_blocks = count_code_blocks(&collapsed);
    stats.new_urls = count_urls(&collapsed);
    stats.new_headings = count_headings(&collapsed);

    let safe = stats.new_code_blocks == stats.orig_code_blocks
        && stats.new_urls >= stats.orig_urls
        && stats.new_headings == stats.orig_headings
        // Losing more than 80% of the bytes is treated as suspicious.
        && stats.new_bytes * 5 >= stats.orig_bytes;

    CompressResult {
        output: collapsed,
        stats,
        safe,
    }
}
/// Reports whether `s` looks like a markdown table row: after leading
/// whitespace it starts with `|` and contains at least one more `|`.
fn is_table_row(s: &str) -> bool {
    match s.trim_start().strip_prefix('|') {
        Some(after_first_pipe) => after_first_pipe.contains('|'),
        None => false,
    }
}
/// Reports whether a line must be copied verbatim instead of being compressed.
///
/// Protected lines are blank lines and, ignoring leading whitespace, lines
/// starting with `#`, `<!--`, `>`, `---` or `===`.
fn is_protected_line(s: &str) -> bool {
    const PROTECTED_PREFIXES: [&str; 5] = ["#", "<!--", ">", "---", "==="];
    let content = s.trim_start();
    content.is_empty()
        || PROTECTED_PREFIXES
            .iter()
            .any(|prefix| content.starts_with(*prefix))
}
/// Counts fenced code blocks as pairs of lines that start with ``` (after
/// leading whitespace). An unpaired trailing fence is not counted.
fn count_code_blocks(s: &str) -> usize {
    let mut fence_lines = 0;
    for line in s.lines() {
        if line.trim_start().starts_with("```") {
            fence_lines += 1;
        }
    }
    fence_lines / 2
}
/// Counts the `http://` and `https://` URLs in `s`.
///
/// A URL extends up to the next whitespace, `)`, `]` or `"`; scanning resumes
/// right after it, so a scheme embedded inside a URL is not counted again.
fn count_urls(s: &str) -> usize {
    fn ends_url(c: char) -> bool {
        c.is_whitespace() || matches!(c, ')' | ']' | '"')
    }

    let mut total = 0;
    let mut cursor = s;
    while let Some(pos) = cursor.find("http") {
        let candidate = &cursor[pos..];
        let is_url = candidate.starts_with("http://") || candidate.starts_with("https://");
        cursor = if is_url {
            total += 1;
            let stop = candidate.find(ends_url).unwrap_or(candidate.len());
            &candidate[stop..]
        } else {
            // Just the letters "http": step over them and keep looking.
            &candidate[4..]
        };
    }
    total
}
/// Counts the lines whose first non-whitespace character is `#`.
fn count_headings(s: &str) -> usize {
    let mut total = 0;
    for line in s.lines() {
        if let Some('#') = line.trim_start().chars().next() {
            total += 1;
        }
    }
    total
}
/// Collapses every run of blank (empty or whitespace-only) lines into a
/// single empty line.
///
/// Lines inside ``` fences are copied verbatim, so blank runs and
/// whitespace-only lines that belong to a code block are never altered. The
/// presence or absence of a final newline is preserved: a terminating `'\n'`
/// ends the last line and is not treated as an extra blank line.
fn collapse_blank_runs(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let terminated = s.ends_with('\n');
    // Strip the terminator so `split` does not yield a phantom empty segment.
    let body = s.strip_suffix('\n').unwrap_or(s);

    let mut in_fence = false;
    let mut blank_run = 0usize;
    for line in body.split('\n') {
        let is_fence = line.trim_start().starts_with("```");
        if is_fence {
            in_fence = !in_fence;
        }
        if is_fence || in_fence {
            // Fence markers and code content pass through untouched.
            blank_run = 0;
            out.push_str(line);
            out.push('\n');
        } else if line.trim().is_empty() {
            blank_run += 1;
            if blank_run == 1 {
                out.push('\n');
            }
        } else {
            blank_run = 0;
            out.push_str(line);
            out.push('\n');
        }
    }

    if !terminated && out.ends_with('\n') {
        out.pop();
    }
    out
}
/// A slice of a prose line, tagged by whether it may be rewritten.
#[derive(Debug)]
enum Span<'a> {
    /// Inline code or a URL; must be emitted unchanged.
    Verbatim(&'a str),
    /// Ordinary text that may be compressed.
    Prose(&'a str),
}

/// Splits `line` into alternating prose and verbatim spans.
///
/// Verbatim spans are:
/// - inline code: from a backtick up to and including the next backtick, or
///   to the end of the line when it is never closed;
/// - URLs: from `http://` / `https://` up to (not including) the next
///   whitespace, `)`, `]`, `"` or `>`.
///
/// Concatenating the returned spans in order reproduces `line` exactly.
fn split_protected_spans(line: &str) -> Vec<Span<'_>> {
    let mut spans = Vec::new();
    let bytes = line.as_bytes();
    let mut i = 0;
    let mut prose_start = 0;

    // Scanning bytes is safe for *finding* the ASCII triggers (an ASCII byte
    // never occurs inside a multi-byte UTF-8 sequence), so every slice taken
    // at `i` below starts on a character boundary.
    while i < bytes.len() {
        match bytes[i] {
            b'`' => {
                if prose_start < i {
                    spans.push(Span::Prose(&line[prose_start..i]));
                }
                let start = i;
                i = match line[start + 1..].find('`') {
                    // Include the closing backtick in the span.
                    Some(rel) => start + 1 + rel + 1,
                    None => line.len(),
                };
                spans.push(Span::Verbatim(&line[start..i]));
                prose_start = i;
            }
            b'h' if line[i..].starts_with("http://") || line[i..].starts_with("https://") => {
                if prose_start < i {
                    spans.push(Span::Prose(&line[prose_start..i]));
                }
                let start = i;
                // The end of the URL must be searched per character, not per
                // byte: a continuation byte such as 0xA0 or 0x85 reinterpreted
                // as a `char` counts as whitespace, which would stop the scan
                // in the middle of a character and make the slice panic.
                i = line[start..]
                    .find(|c: char| c.is_whitespace() || matches!(c, ')' | ']' | '"' | '>'))
                    .map_or(line.len(), |rel| start + rel);
                spans.push(Span::Verbatim(&line[start..i]));
                prose_start = i;
            }
            _ => i += 1,
        }
    }

    if prose_start < line.len() {
        spans.push(Span::Prose(&line[prose_start..]));
    }
    spans
}
/// Compresses one prose line.
///
/// Leading whitespace and a list marker (if any) are kept as they are; the
/// remainder is split into spans, prose spans are compressed and verbatim
/// spans copied. Trailing spaces and tabs are removed from the result.
fn compress_prose_line(line: &str, mode: Mode, locale: &Locale) -> String {
    let body = line.trim_start();
    let (indent, _) = line.split_at(line.len() - body.len());
    let (marker, rest) = split_list_marker(body);

    let mut result = String::with_capacity(line.len());
    result.push_str(indent);
    result.push_str(marker);
    for span in split_protected_spans(rest) {
        match span {
            Span::Verbatim(raw) => result.push_str(raw),
            Span::Prose(prose) => result.push_str(&compress_prose_span(prose, mode, locale)),
        }
    }

    let kept = result
        .trim_end_matches(|c: char| c == ' ' || c == '\t')
        .len();
    result.truncate(kept);
    result
}
/// Splits a leading list marker off `s`, returning `(marker, rest)`.
///
/// Recognised markers are `- `, `* `, `+ ` and one or more ASCII digits
/// followed by `.` or `)` and a space. The marker includes its trailing
/// space. When there is no marker, the first element is empty and the second
/// is `s` itself.
fn split_list_marker(s: &str) -> (&str, &str) {
    let bytes = s.as_bytes();

    // Bullet marker: one of -, *, + followed by a space.
    if let (Some(&first), Some(&b' ')) = (bytes.get(0), bytes.get(1)) {
        if matches!(first, b'-' | b'*' | b'+') {
            return s.split_at(2);
        }
    }

    // Ordered marker: digits, then '.' or ')', then a space.
    let digits = s.bytes().take_while(u8::is_ascii_digit).count();
    if digits > 0 {
        if let Some(tail) = bytes.get(digits..digits + 2) {
            if matches!(tail[0], b'.' | b')') && tail[1] == b' ' {
                return s.split_at(digits + 2);
            }
        }
    }

    ("", s)
}
/// Compresses one prose span (text containing no inline code or URL).
///
/// Steps, in order:
/// 1. remove every phrase listed in `locale.phrases` (case-insensitively);
/// 2. drop stand-alone words found in the locale's filler, hedge or article
///    lists, together with the whitespace preceding them;
/// 3. squeeze whitespace runs to a single space and trim the end;
/// 4. remove a trailing conjunction and orphaned punctuation;
/// 5. in `Mode::Ultra`, apply the locale's abbreviation substitutions.
///
/// A leading or trailing space of `text` is restored afterwards so the span
/// still joins correctly with its verbatim neighbours. Whitespace-only spans
/// are returned unchanged.
fn compress_prose_span(text: &str, mode: Mode, locale: &Locale) -> String {
    fn is_ws(tok: &str) -> bool {
        tok.chars().all(char::is_whitespace)
    }

    if text.trim().is_empty() {
        return text.to_string();
    }

    let mut s = text.to_string();
    for phrase in locale.phrases {
        s = drop_phrase_ci(&s, phrase);
    }

    // Tokenise into words and single-character whitespace tokens.
    let mut tokens: Vec<String> = Vec::new();
    let mut buf = String::new();
    for c in s.chars() {
        if c.is_whitespace() {
            if !buf.is_empty() {
                tokens.push(std::mem::take(&mut buf));
            }
            tokens.push(c.to_string());
        } else {
            buf.push(c);
        }
    }
    if !buf.is_empty() {
        tokens.push(buf);
    }

    // Drop filler/hedge/article words along with the whitespace before them.
    let mut kept: Vec<&str> = Vec::with_capacity(tokens.len());
    for tok in &tokens {
        if !is_ws(tok) && is_clean_word(tok) {
            let lower = strip_punct(&tok.to_lowercase());
            let droppable = locale.fillers.contains(&lower.as_str())
                || locale.hedges.contains(&lower.as_str())
                || locale.articles.contains(&lower.as_str());
            if droppable {
                if kept.last().map_or(false, |prev| is_ws(prev)) {
                    kept.pop();
                }
                continue;
            }
        }
        kept.push(tok.as_str());
    }

    // Re-join, turning every whitespace run into one space.
    let mut out = String::with_capacity(s.len());
    let mut last_ws = false;
    for tok in &kept {
        if is_ws(tok) {
            if !last_ws {
                out.push(' ');
                last_ws = true;
            }
        } else {
            out.push_str(tok);
            last_ws = false;
        }
    }

    let trimmed = trim_trailing_conjunction(out.trim_end(), locale);
    let cleaned = clean_mid_orphan_punct(strip_leading_orphan_punct(&trimmed));
    let final_out = if mode == Mode::Ultra {
        ultra_subs(cleaned, locale)
    } else {
        cleaned
    };

    let wants_leading = text.starts_with(' ');
    let wants_trailing = text.ends_with(' ');
    if final_out.is_empty() {
        // Everything was elided. Restoring both the leading and the trailing
        // space would leave two adjacent spaces between the neighbouring
        // spans; a single separator is enough.
        return if wants_leading || wants_trailing {
            " ".to_string()
        } else {
            String::new()
        };
    }

    let needs_leading = wants_leading && !final_out.starts_with(' ');
    let needs_trailing = wants_trailing && !final_out.ends_with(' ');
    match (needs_leading, needs_trailing) {
        (true, true) => format!(" {} ", final_out),
        (true, false) => format!(" {}", final_out),
        (false, true) => format!("{} ", final_out),
        (false, false) => final_out,
    }
}
/// Removes commas and semicolons left dangling after word removal.
///
/// Two patterns are rewritten while scanning left to right:
/// - sentence end (`.`, `!`, `?`) + space + `,`/`;` becomes the sentence end
///   followed by one space, and any spaces after the orphan are dropped;
/// - space + `,`/`;` + space becomes a single space.
fn clean_mid_orphan_punct(s: String) -> String {
    fn is_orphan(c: Option<&char>) -> bool {
        matches!(c, Some(&',') | Some(&';'))
    }

    let chars: Vec<char> = s.chars().collect();
    let mut cleaned = String::with_capacity(s.len());
    let mut pos = 0;
    while let Some(&current) = chars.get(pos) {
        let next = chars.get(pos + 1);
        let after_next = chars.get(pos + 2);
        let ends_sentence = matches!(current, '.' | '!' | '?');

        if ends_sentence && next == Some(&' ') && is_orphan(after_next) {
            cleaned.push(current);
            cleaned.push(' ');
            pos += 3;
            // Swallow the spaces that followed the orphan.
            pos += chars[pos..].iter().take_while(|&&ch| ch == ' ').count();
        } else if current == ' ' && is_orphan(next) && after_next == Some(&' ') {
            cleaned.push(' ');
            pos += 3;
        } else {
            cleaned.push(current);
            pos += 1;
        }
    }
    cleaned
}
/// Removes `,`, `;`, `:` and spaces left at the start of the text, then any
/// further whitespace, while keeping the original leading whitespace of `s`.
fn strip_leading_orphan_punct(s: &str) -> String {
    let after_indent = s.trim_start();
    let (indent, _) = s.split_at(s.len() - after_indent.len());
    let body = after_indent
        .trim_start_matches(|c: char| matches!(c, ',' | ';' | ':' | ' '))
        .trim_start();

    let mut cleaned = String::with_capacity(indent.len() + body.len());
    cleaned.push_str(indent);
    cleaned.push_str(body);
    cleaned
}
/// Keeps only alphanumeric characters, apostrophes and slashes of `s`.
fn strip_punct(s: &str) -> String {
    let mut kept = String::with_capacity(s.len());
    for c in s.chars() {
        if c.is_alphanumeric() || matches!(c, '\'' | '/') {
            kept.push(c);
        }
    }
    kept
}
/// Reports whether `tok` is a plain word optionally followed by punctuation.
///
/// The token must begin with at least one alphanumeric character or
/// apostrophe; whatever follows that run may only consist of `,`, `.`, `;`,
/// `:`, `!` and `?`.
fn is_clean_word(tok: &str) -> bool {
    let word_end = tok
        .find(|c: char| !(c.is_alphanumeric() || c == '\''))
        .unwrap_or(tok.len());
    if word_end == 0 {
        return false;
    }
    tok[word_end..]
        .chars()
        .all(|c| matches!(c, ',' | '.' | ';' | ':' | '!' | '?'))
}
/// Removes every case-insensitive occurrence of `needle` from `s`, together
/// with the spaces that immediately follow each occurrence.
///
/// Matching is done on a per-character lowercased copy of `s`; two byte
/// cursors are advanced in lockstep (one in `s`, one in the lowercased copy)
/// because lowercasing can change a character's encoded length. `needle` is
/// lowercased the same way, so its case does not matter. An empty `needle`
/// leaves `s` unchanged.
fn drop_phrase_ci(s: &str, needle: &str) -> String {
    let needle: String = needle.chars().flat_map(char::to_lowercase).collect();
    if needle.is_empty() {
        // An empty needle matches everywhere without consuming anything,
        // which would keep the loop below from ever advancing.
        return s.to_string();
    }

    let lower: String = s.chars().flat_map(char::to_lowercase).collect();
    let mut out = String::with_capacity(s.len());
    let mut s_i = 0usize; // byte cursor in `s`
    let mut l_i = 0usize; // byte cursor in `lower`, kept in step with `s_i`

    while s_i < s.len() {
        debug_assert!(l_i <= lower.len(), "l_i cursor must not exceed lower.len()");
        if lower[l_i..].starts_with(needle.as_str()) {
            // Skip the original characters that produced the matched bytes.
            let l_end = l_i + needle.len();
            while l_i < l_end {
                let ch = match s[s_i..].chars().next() {
                    Some(ch) => ch,
                    None => break,
                };
                s_i += ch.len_utf8();
                l_i += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
            }
            // Swallow the spaces after the phrase so no double space remains.
            while s_i < s.len() && s.as_bytes()[s_i] == b' ' {
                s_i += 1;
                l_i += 1;
            }
        } else {
            let ch = match s[s_i..].chars().next() {
                Some(ch) => ch,
                None => break,
            };
            out.push(ch);
            s_i += ch.len_utf8();
            l_i += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
        }
    }
    out
}
/// Removes a conjunction left dangling at the end of `s`.
///
/// The first entry of `locale.conjunctions` that the lowercased text ends
/// with is cut off, and whitespace before it is trimmed. Without a match the
/// text is returned unchanged.
// NOTE(review): the cut length is taken from the conjunction while the match
// is made on the lowercased copy; this assumes lowercasing does not change
// the byte length of the matched suffix — confirm for all locale entries.
fn trim_trailing_conjunction(s: &str, locale: &Locale) -> String {
    let lowered = s.to_lowercase();
    let matched = IntoIterator::into_iter(locale.conjunctions)
        .find(|candidate| lowered.ends_with(*candidate));
    match matched {
        Some(conjunction) => {
            let keep = s.len() - conjunction.len();
            s[..keep].trim_end().to_string()
        }
        None => s.to_string(),
    }
}
/// Applies every `(long, short)` pair of `locale.ultra_subs` to `s` in
/// order, replacing whole-word occurrences of `long` with `short`.
fn ultra_subs(s: String, locale: &Locale) -> String {
    IntoIterator::into_iter(locale.ultra_subs).fold(s, |text, (long, short)| {
        replace_word_boundary(&text, long, short)
    })
}
/// Reports whether `c` counts as part of a word: an underscore or any
/// Unicode alphanumeric character.
fn is_word_char_unicode(c: char) -> bool {
    matches!(c, '_') || c.is_alphanumeric()
}
/// Replaces whole-word, case-insensitive occurrences of `needle` in `s` with
/// `repl`.
///
/// An occurrence only counts when the characters directly before and after
/// it are not word characters (or the occurrence touches either end of the
/// text). Comparison is done on per-character lowercased text.
fn replace_word_boundary(s: &str, needle: &str, repl: &str) -> String {
    let target: String = needle.chars().flat_map(char::to_lowercase).collect();
    let text: Vec<char> = s.chars().collect();
    let mut result = String::with_capacity(s.len());

    let mut pos = 0;
    while pos < text.len() {
        // Grow a lowercased window from `pos` until it equals the target or
        // stops being a prefix of it.
        let mut folded = String::new();
        let mut match_end: Option<usize> = None;
        for (offset, ch) in text[pos..].iter().enumerate() {
            folded.extend(ch.to_lowercase());
            if folded == target {
                match_end = Some(pos + offset + 1);
                break;
            }
            if !target.starts_with(folded.as_str()) {
                break;
            }
        }

        let accepted = match match_end {
            Some(end) => {
                let left_ok = pos == 0 || !is_word_char_unicode(text[pos - 1]);
                let right_ok = text.get(end).map_or(true, |&c| !is_word_char_unicode(c));
                left_ok && right_ok
            }
            None => false,
        };

        match match_end {
            Some(end) if accepted => {
                result.push_str(repl);
                pos = end;
            }
            _ => {
                result.push(text[pos]);
                pos += 1;
            }
        }
    }
    result
}
// Unit tests for the compressor. All end-to-end tests go through
// `compress_text`, i.e. the locale registered under the code "en".
#[cfg(test)]
mod tests {
use super::*;
// Two complete fence pairs must be counted as two code blocks.
#[test]
fn count_code_blocks_pairs_fences() {
let s = "intro\n```\ncode\n```\nmid\n```rust\nx\n```\n";
assert_eq!(count_code_blocks(s), 2);
}
// Code inside a fence survives untouched while surrounding prose is compressed.
#[test]
fn fenced_code_preserved_verbatim() {
let input = "Some prose with the article.\n```rust\nfn main() { let x = 1; }\n```\nMore prose.\n";
let r = compress_text(input, Mode::Full);
assert!(r.safe);
assert!(r.output.contains("fn main() { let x = 1; }"));
assert!(!r.output.contains("the article"));
}
// A bare URL inside a prose line is kept as is.
#[test]
fn url_preserved_inline() {
let input = "Check https://example.com/foo for the docs.\n";
let r = compress_text(input, Mode::Full);
assert!(r.safe);
assert!(r.output.contains("https://example.com/foo"));
}
// The target of a markdown link is kept as is.
#[test]
fn markdown_link_url_preserved() {
let input = "See [the docs](https://example.com/x) for more.\n";
let r = compress_text(input, Mode::Full);
assert!(r.safe);
assert!(r.output.contains("https://example.com/x"));
}
// Compression neither adds nor removes headings.
#[test]
fn heading_count_unchanged() {
let input = "# H1\n\nprose\n\n## H2\n\nmore prose with the article\n";
let r = compress_text(input, Mode::Full);
assert_eq!(r.stats.orig_headings, r.stats.new_headings);
}
// Expects "just", "really" and "basically" to be dropped by the "en" locale.
#[test]
fn fillers_removed() {
let input = "This is just really basically a simple test.\n";
let r = compress_text(input, Mode::Full);
assert!(!r.output.contains("just"));
assert!(!r.output.contains("really"));
assert!(!r.output.contains("basically"));
}
// Expects the "en" locale to drop the phrases "happy to" and "of course".
#[test]
fn pleasantries_removed() {
let input = "I'd be happy to help you with that, of course.\n";
let r = compress_text(input, Mode::Full);
assert!(!r.output.to_lowercase().contains("happy to"));
assert!(!r.output.to_lowercase().contains("of course"));
}
// Ultra mode abbreviates; expects "w/" and "param" to appear in the output.
#[test]
fn ultra_substitutes_with() {
let input = "Configure the app with these parameters.\n";
let r = compress_text(input, Mode::Ultra);
assert!(r.output.contains("w/"));
assert!(r.output.contains("param"));
}
// Ultra substitutions must not reach into fenced code.
#[test]
fn ultra_does_not_touch_code_block() {
let input = "Configure with these.\n```\nfn with_config() {}\n```\n";
let r = compress_text(input, Mode::Ultra);
assert!(r.output.contains("fn with_config() {}"));
}
// Inline code between backticks is kept as is.
#[test]
fn inline_code_preserved() {
let input = "Use `cargo build --release` to compile the binary.\n";
let r = compress_text(input, Mode::Full);
assert!(r.output.contains("`cargo build --release`"));
}
// Table rows are copied without modification.
#[test]
fn table_preserved_verbatim() {
let input = "Intro.\n\n| col1 | col2 |\n|------|------|\n| a | b |\n\nOutro.\n";
let r = compress_text(input, Mode::Full);
assert!(r.output.contains("| col1 | col2 |"));
assert!(r.output.contains("| a | b |"));
}
// NOTE(review): this test only compares two values it has just assigned; it
// never runs the compressor, so it cannot fail and does not exercise the
// `safe` flag its name refers to. Consider replacing it with a test that
// drives the real integrity check.
#[test]
fn safe_false_when_url_dropped() {
let mut s = Stats::default();
s.orig_urls = 3;
s.new_urls = 2;
assert!(s.new_urls < s.orig_urls);
}
// A document mixing a heading, a link and a code block passes the integrity check.
#[test]
fn integrity_check_on_real_input() {
let input = "# Title\n\nprose with the link [example](https://example.com).\n\n```\ncode\n```\n";
let r = compress_text(input, Mode::Full);
assert!(r.safe);
assert_eq!(r.stats.orig_headings, r.stats.new_headings);
assert_eq!(r.stats.orig_code_blocks, r.stats.new_code_blocks);
assert!(r.stats.new_urls >= r.stats.orig_urls);
}
// Compressing already-compressed output must still pass the integrity check.
#[test]
fn idempotent_on_already_compressed() {
let input = "# Title\n\nshort terse content.\n";
let r1 = compress_text(input, Mode::Full);
let r2 = compress_text(&r1.output, Mode::Full);
assert!(r2.safe);
}
// The list marker stays in place while the item text is compressed.
#[test]
fn list_marker_preserved() {
let input = "- the first item\n- the second item\n";
let r = compress_text(input, Mode::Full);
assert!(r.output.starts_with("- "));
assert!(!r.output.contains("the first"));
}
}