basemind 0.8.0

Full AI context layer over MCP — tree-sitter code-map, document RAG (PDF/Office/HTML/email + OCR + reranker), shared agent memory, on-demand web crawl, git history + blame + per-symbol diff. 300+ languages, 10+ coding-agent harnesses, content-addressed Fjall + LanceDB.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
//! Behavioral output compression (token-reduction workstream W6).
//!
//! Compress verbose command output into a compact summary so an agent's tool
//! results cost far fewer tokens — while NEVER dropping errors or secrets.
//!
//! The single public entry point is [`compress_output`]. Given raw command
//! output (and an optional family hint), it:
//!
//! 1. Strips ANSI escape codes (keeping visible text).
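//! 2. Fails open — returns the ANSI-stripped input otherwise unchanged, with
//!    `compressed = false` — when the input is too short to be worth
//!    compressing, when the output shows the command errored, or when no family
//!    handler can save at least [`MIN_SAVINGS_RATIO`]. (One exception: if
//!    stripping itself would destroy a credential hidden inside an escape
//!    sequence, the original un-stripped input is returned instead.)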
//! 3. Otherwise runs the family handler, then re-injects any credential- or
//!    error-bearing line a handler would have dropped, so secrets and error
//!    detail always survive.
//!
//! Ported from the alexgreensh/token-optimizer `bash_compress.py` reference,
//! adapted to a pure read-only `text -> text` transform (this tool never runs
//! the command — it only sees output).

pub mod checkpoint;
pub mod cli;
pub mod delta;
mod detect;
mod handlers;
mod safety;
pub mod waste;

pub use detect::Family;

/// Result of a [`compress_output`] call.
///
/// `output` is the text to surface to the agent: the compact summary when
/// `compressed` is `true`, or the raw input when it is `false`. The raw input
/// is normally returned ANSI-stripped; the one exception is when stripping
/// itself would destroy a credential (for example one hidden in a hyperlink
/// target), in which case the original un-stripped text is returned so the
/// secret is not silently discarded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompressionOutcome {
    /// The text to emit downstream.
    pub output: String,
    /// Byte length of the original input (after ANSI stripping).
    pub original_bytes: usize,
    /// Byte length of [`output`](Self::output). On a pass-through this is the
    /// length of whatever text was returned, so in the un-stripped credential
    /// case it may differ from [`original_bytes`](Self::original_bytes).
    pub compressed_bytes: usize,
    /// The family that was used (detected or supplied), as a stable lowercase string.
    /// A supplied hint that is not a recognised family falls back to detection.
    pub family_detected: String,
    /// Whether compression was applied. `false` means the raw input was returned.
    pub compressed: bool,
}

/// Minimum byte-savings ratio required to accept a compression. Below this we
/// fail open and return the raw input — the risk of dropping a meaningful line
/// is not worth a marginal token win.
///
/// The ratio is `1 - compressed_bytes / original_bytes`, computed on the
/// handler output after any preserved lines have been re-injected.
pub const MIN_SAVINGS_RATIO: f64 = 0.10;

/// Inputs below this byte length are never worth compressing; pass through.
/// The length is measured on the ANSI-stripped text.
const MIN_INPUT_BYTES: usize = 100;

/// Cap on re-injected preserved (credential- or error-bearing) lines. When a
/// handler dropped more such lines than this, they are NOT truncated to fit:
/// [`compress_output`] fails open and returns the raw input instead, so output
/// where most lines carry a secret (e.g. grep for keys) is passed through
/// verbatim rather than losing any of them.
const MAX_REINJECTED: usize = 32;

/// Compress `text`, the captured output of a command belonging to `family`.
///
/// `family` is an optional hint. When it is `None`, or is not a recognised
/// family name, the family is sniffed from the output shape instead (see
/// [`detect`]). The family actually used is always reported in the outcome,
/// including on a pass-through.
///
/// Fail-open guarantees (the security-critical contract):
/// - errored output (error / fatal / exception / traceback markers) is returned raw;
/// - any line carrying a credential or error marker is preserved verbatim;
/// - a compression saving less than [`MIN_SAVINGS_RATIO`] is discarded.
///
/// Returns a [`CompressionOutcome`] whose `compressed` flag tells the caller
/// whether `output` is a summary (`true`) or the raw input (`false`).
pub fn compress_output(text: &str, family: Option<&str>) -> CompressionOutcome {
    // ANSI stripping always comes first: it is safe, and it means every later
    // scan works on the visible text (including a credential that was wrapped
    // in a colour sequence or used as a hyperlink label).
    let cleaned = safety::strip_ansi(text);
    let original_bytes = cleaned.len();

    // Settle the family before any early return so that pass-through outcomes
    // can report it as well. An unrecognised hint falls back to sniffing.
    let resolved = family
        .and_then(detect::Family::parse)
        .unwrap_or_else(|| detect::detect(&cleaned));
    let family_name = resolved.as_str().to_string();

    // Builds the fail-open outcome: `raw` is handed back as-is, flagged as
    // not compressed.
    let fail_open = |raw: String| CompressionOutcome {
        compressed_bytes: raw.len(),
        output: raw,
        original_bytes,
        family_detected: family_name.clone(),
        compressed: false,
    };

    // Stripping an OSC8 hyperlink keeps only its visible label and throws the
    // *target* away. A credential living in that URI — e.g.
    // `\x1b]8;;postgres://u:SECRET@h\x07ok\x1b]8;;\x07` — has therefore already
    // vanished from `cleaned`, where no later scan could find it. Compare the
    // RAW input against the stripped text: a credential present before but not
    // after means the strip ate a secret, so hand back the RAW input verbatim.
    let secret_lost_in_strip =
        safety::contains_credential(text) && !safety::contains_credential(&cleaned);
    if secret_lost_in_strip {
        return fail_open(text.to_string());
    }

    // Two cheap reasons to leave the text alone: it is too small to be worth
    // compressing, or it looks like the command failed — debugging signal must
    // never be compressed away. The failure scan only runs on large-enough input.
    if original_bytes < MIN_INPUT_BYTES || safety::looks_like_failure(&cleaned) {
        return fail_open(cleaned);
    }

    // Handlers are pure and total, so there is no panic path here; a handler
    // that achieves nothing is caught by the savings gate further down.
    let summary = handlers::compress(resolved, &cleaned);

    // Put back any preserved (credential / error) line the handler dropped.
    // Matching is by exact line, not substring, so a short secret line that
    // happens to appear inside a longer summary line is still surfaced alone.
    let preserved = safety::preserved_line_indices(&cleaned);
    let summary = if preserved.is_empty() {
        summary
    } else {
        match reinject_preserved(&cleaned, &summary, &preserved) {
            // Everything fitted under the cap: continue with the augmented text.
            Reinjection::Complete(augmented) => augmented,
            // Too many preserved lines to append. Truncating could drop secrets
            // 33+ while still reporting `compressed = true`, so fail open — in
            // the raw input every preserved line survives verbatim.
            Reinjection::Overflowed => return fail_open(cleaned),
        }
    };

    // Last line of defence: the input holds a credential somewhere, yet none
    // made it into the summary (e.g. it sat on a line that was restructured).
    // Refuse to compress and return the raw input.
    if safety::contains_credential(&cleaned) && !safety::contains_credential(&summary) {
        return fail_open(cleaned);
    }

    // Savings gate: a compression that does not save enough is thrown away.
    let compressed_bytes = summary.len();
    let saved = match original_bytes {
        0 => 0.0,
        total => 1.0 - (compressed_bytes as f64 / total as f64),
    };
    if saved < MIN_SAVINGS_RATIO {
        return fail_open(cleaned);
    }

    CompressionOutcome {
        output: summary,
        original_bytes,
        compressed_bytes,
        family_detected: family_name,
        compressed: true,
    }
}

/// Outcome of re-injecting preserved lines into a handler's output.
///
/// Produced by [`reinject_preserved`] and consumed by [`compress_output`],
/// which either continues with the augmented text or fails open.
enum Reinjection {
    /// Every preserved line not already present was appended (within the cap).
    /// Carries the resulting text; when nothing needed appending this is the
    /// handler output unchanged.
    Complete(String),
    /// More droppable preserved lines exist than [`MAX_REINJECTED`] allows;
    /// the caller must fail open rather than truncate and lose secrets.
    Overflowed,
}

/// Restore preserved lines that a handler dropped.
///
/// `preserved` holds line indices into `cleaned`. Each indexed line that does
/// not already occur as an exact line of `compressed` is appended to it, in
/// source order and at most once even if it repeats in the input. Indices past
/// the last line of `cleaned` are ignored.
///
/// Returns [`Reinjection::Complete`] with the (possibly augmented) text, or
/// [`Reinjection::Overflowed`] when more than [`MAX_REINJECTED`] lines would
/// have to be appended — the caller then fails open, because truncating
/// preserved lines could silently drop a secret while still reporting a
/// successful compression.
fn reinject_preserved(
    cleaned: &str,
    compressed: &str,
    preserved: &ahash::AHashSet<usize>,
) -> Reinjection {
    let source_lines: Vec<&str> = cleaned.lines().collect();
    // Lines already visible in the output; grows as missing lines are queued so
    // that a repeated preserved line is only appended once.
    let mut seen: ahash::AHashSet<&str> = compressed.lines().collect();

    // Hash-set iteration order is arbitrary; sort for a deterministic,
    // source-ordered result.
    let mut ordered: Vec<usize> = preserved.iter().copied().collect();
    ordered.sort_unstable();

    let mut missing: Vec<&str> = Vec::new();
    let candidates = ordered
        .into_iter()
        .filter_map(|idx| source_lines.get(idx).copied());
    for line in candidates {
        // `insert` yields `false` when the line was already present.
        if !seen.insert(line) {
            continue;
        }
        // One more line than the cap permits: never truncate, fail open.
        if missing.len() >= MAX_REINJECTED {
            return Reinjection::Overflowed;
        }
        missing.push(line);
    }

    if missing.is_empty() {
        Reinjection::Complete(compressed.to_string())
    } else {
        Reinjection::Complete(format!("{compressed}\n{}", missing.join("\n")))
    }
}

// Unit tests for `compress_output`, in three groups: each command family
// actually shrinks representative output; the fail-open paths return the raw
// input; and credential- or error-bearing lines always survive.
#[cfg(test)]
mod tests {
    use super::*;

    // --- per-family shrink tests -------------------------------------------

    // A `git status` with unstaged and untracked files must compress, and the
    // summary must mention the branch (`branch: main`) and the word `unstaged`.
    #[test]
    fn git_status_shrinks() {
        let input = "On branch main\nYour branch is up to date with 'origin/main'.\n\n\
                     Changes not staged for commit:\n  (use \"git add ...\")\n\
                     \tmodified:   src/lib.rs\n\tmodified:   src/main.rs\n\
                     \tmodified:   Cargo.toml\n\nUntracked files:\n  (use \"git add ...\")\n\
                     \tnewfile.rs\n\tother.rs\n\nno changes added to commit\n";
        let r = compress_output(input, Some("git_status"));
        assert!(r.compressed, "git status should compress: {:?}", r.output);
        assert!(r.compressed_bytes < r.original_bytes);
        assert!(r.output.contains("branch: main"));
        assert!(r.output.contains("unstaged"));
    }

    // Thirty commits, each carrying two `gpg:` signature lines: the log must
    // compress and no `gpg:` text may remain in the summary.
    #[test]
    fn git_log_shrinks() {
        let mut input = String::new();
        for n in 0..30 {
            input.push_str(&format!(
                "commit {:040x}\ngpg: Signature made ...\ngpg: Good signature\n\
                 Author: Dev <d@x>\nDate:   today\n\n    message {n}\n\n",
                n
            ));
        }
        let r = compress_output(&input, Some("git_log"));
        assert!(r.compressed, "git log should compress");
        assert!(!r.output.contains("gpg:"), "gpg lines should be dropped");
    }

    // A single-file diff with 200 added lines must compress and carry a
    // "more lines" truncation note.
    #[test]
    fn git_diff_shrinks() {
        let mut input = String::from("diff --git a/x b/x\n--- a/x\n+++ b/x\n");
        for n in 0..200 {
            input.push_str(&format!("+added line {n}\n"));
        }
        let r = compress_output(&input, Some("git_diff"));
        assert!(r.compressed, "large diff should compress");
        assert!(r.output.contains("more lines"));
    }

    // Sixty `npm http fetch` progress lines followed by the install summary:
    // the summary ("audited") is kept, the fetch noise is dropped.
    #[test]
    fn npm_install_shrinks() {
        let mut input = String::new();
        for n in 0..60 {
            input.push_str(&format!(
                "npm http fetch GET 200 https://registry/pkg{n} 12ms\n"
            ));
        }
        input.push_str("added 60 packages, and audited 61 packages in 3s\n");
        input.push_str("found 0 vulnerabilities\n");
        let r = compress_output(&input, Some("npm_install"));
        assert!(r.compressed, "npm install should compress");
        assert!(r.output.contains("audited"));
        assert!(!r.output.contains("npm http fetch"));
    }

    // Sixty `Compiling` lines plus one warning and the `Finished` line: the
    // build must compress while keeping the warning and the finish line.
    #[test]
    fn cargo_build_shrinks() {
        let mut input = String::new();
        for n in 0..60 {
            input.push_str(&format!("   Compiling crate{n} v0.1.0\n"));
        }
        input.push_str("warning: unused variable: `x`\n");
        input.push_str("    Finished dev [unoptimized] target(s) in 12.3s\n");
        let r = compress_output(&input, Some("cargo_build"));
        assert!(r.compressed, "cargo build should compress");
        assert!(r.output.contains("warning"));
        assert!(r.output.contains("Finished"));
    }

    // Forty PASSED lines and a session footer: the run must compress and the
    // summary must still say "passed".
    #[test]
    fn pytest_shrinks() {
        let mut input = String::from("============ test session starts ============\n");
        for n in 0..40 {
            input.push_str(&format!(
                "tests/test_mod.py::test_{n} PASSED                  [ {n}%]\n"
            ));
        }
        input.push_str(
            "===================== 39 passed, 1 skipped in 2.10s =====================\n",
        );
        let r = compress_output(&input, Some("pytest"));
        assert!(r.compressed, "pytest should compress: {:?}", r.output);
        assert!(r.output.contains("passed"));
    }

    // A 120-entry listing must compress and carry a "more entries" note.
    #[test]
    fn ls_shrinks() {
        let mut input = String::new();
        for n in 0..120 {
            input.push_str(&format!("file_{n:04}.txt\n"));
        }
        let r = compress_output(&input, Some("ls"));
        assert!(r.compressed, "long listing should compress");
        assert!(r.output.contains("more entries"));
    }

    // 30 files x 10 identical-looking matches each: the grep output must
    // compress and report both per-file and overflow match counts.
    #[test]
    fn grep_shrinks() {
        let mut input = String::new();
        for f in 0..30 {
            for l in 0..10 {
                input.push_str(&format!(
                    "src/file{f}.rs:{}:    let value = compute();\n",
                    l + 1
                ));
            }
        }
        let r = compress_output(&input, Some("grep"));
        assert!(r.compressed, "grep output should compress");
        assert!(r.output.contains("matches in"));
        assert!(r.output.contains("more matches in"));
    }

    // Fifty identical log lines must collapse into one line with a `(x50)`
    // repeat count.
    #[test]
    fn logs_shrinks() {
        let mut input = String::new();
        for _ in 0..50 {
            input.push_str("waiting for connection...\n");
        }
        let r = compress_output(&input, Some("logs"));
        assert!(r.compressed, "duplicate-heavy logs should compress");
        assert!(r.output.contains("(x50)"));
    }

    // --- fail-open tests ----------------------------------------------------

    // An otherwise compressible `git status` that ends in an `error:` line must
    // be returned raw (ANSI-stripped only), with the error line intact.
    #[test]
    fn fail_open_on_errored_output() {
        let mut input = String::from("On branch main\n");
        for n in 0..40 {
            input.push_str(&format!("\tmodified:   file{n}.rs\n"));
        }
        input.push_str("error: something went catastrophically wrong\n");
        let r = compress_output(&input, Some("git_status"));
        assert!(!r.compressed, "errored output must not compress");
        assert!(
            r.output
                .contains("error: something went catastrophically wrong")
        );
        assert_eq!(r.output, safety_strip(&input), "raw passthrough expected");
    }

    // Output the handler cannot meaningfully shrink must be discarded by the
    // savings gate and the input handed back.
    #[test]
    fn fail_open_on_low_savings() {
        // A short, already-compact git status (clean tree) compresses well, so
        // build an input whose handler output is nearly as large as the input:
        // a unique-line log that the logs handler cannot collapse.
        let mut input = String::new();
        for n in 0..40 {
            input.push_str(&format!(
                "unique log line number {n} with distinct content here\n"
            ));
        }
        let r = compress_output(&input, Some("logs"));
        assert!(!r.compressed, "no-savings output must pass through raw");
        // `input` ends in exactly one '\n', so the right-hand side is just
        // `input` again: the output must equal the input verbatim.
        assert_eq!(r.output, input.trim_end_matches('\n').to_string() + "\n");
    }

    // --- credential-preservation tests -------------------------------------

    #[test]
    fn preserves_aws_key_in_droppable_git_status() {
        // The AWS key is on an untracked-file line the handler would normally
        // fold into a comma-joined "untracked: ..." list. It must survive
        // verbatim.
        let mut input = String::from("On branch main\n\nUntracked files:\n  (use \"git add\")\n");
        for n in 0..40 {
            input.push_str(&format!("\tjunk_file_{n}.tmp\n"));
        }
        input.push_str("\tleaked_AKIAIOSFODNN7EXAMPLE_creds.txt\n");
        let r = compress_output(&input, Some("git_status"));
        assert!(
            r.output.contains("AKIAIOSFODNN7EXAMPLE"),
            "AWS key must survive compression, got: {}",
            r.output
        );
    }

    // A GitHub-PAT-shaped token (`ghp_` + 36 chars) on a single line after
    // fifty duplicate log lines must still be present in the output.
    #[test]
    fn preserves_github_pat_in_logs() {
        let pat = format!("ghp_{}", "a".repeat(36));
        let mut input = String::new();
        for _ in 0..50 {
            input.push_str("connecting...\n");
        }
        input.push_str(&format!("auth header token {pat}\n"));
        let r = compress_output(&input, Some("logs"));
        assert!(
            r.output.contains(&pat),
            "GitHub PAT must survive: {}",
            r.output
        );
    }

    // A PEM private-key header appearing as the last entry of a listing must
    // still be present in the output.
    #[test]
    fn preserves_private_key_header() {
        let mut input = String::new();
        for n in 0..60 {
            input.push_str(&format!("file_{n}.pem\n"));
        }
        input.push_str("-----BEGIN RSA PRIVATE KEY-----\n");
        let r = compress_output(&input, Some("ls"));
        assert!(
            r.output.contains("-----BEGIN RSA PRIVATE KEY-----"),
            "private key header must survive: {}",
            r.output
        );
    }

    #[test]
    fn preserves_secret_in_osc8_hyperlink_uri() {
        // A credential lives in the OSC8 hyperlink *target* (URI), not the
        // visible label. Stripping ANSI keeps only the label "ok", destroying
        // the secret before any scan runs on `cleaned`. The raw pre-scan must
        // catch this and fail open, returning the RAW input verbatim.
        let secret_link =
            "\x1b]8;;postgres://admin:SECRETPASSWORD@db.internal/prod\x07ok\x1b]8;;\x07";
        let mut input = String::new();
        for _ in 0..30 {
            input.push_str("waiting for connection...\n");
        }
        input.push_str(secret_link);
        input.push('\n');

        let r = compress_output(&input, Some("logs"));
        assert!(
            !r.compressed,
            "OSC8-URI secret must fail open, not compress"
        );
        assert_eq!(r.output, input, "must return the RAW pre-strip input");
        assert!(
            r.output
                .contains("postgres://admin:SECRETPASSWORD@db.internal/prod"),
            "credential in the hyperlink URI must survive: {}",
            r.output
        );
    }

    #[test]
    fn npm_err_failures_fail_open_raw() {
        // 10 `npm ERR!` lines describe a fully-failed install. Without the
        // bang-form error detection these would be summarized to a single warn
        // line; the fix must fail open and return the raw input.
        let mut input = String::new();
        for n in 0..10 {
            input.push_str(&format!("npm ERR! line {n} install failed for dep{n}\n"));
        }
        let r = compress_output(&input, Some("npm_install"));
        assert!(
            !r.compressed,
            "npm ERR! output must fail open, not compress"
        );
        assert_eq!(
            r.output, input,
            "raw passthrough expected for npm ERR! block"
        );
        // Every failure line survives.
        for n in 0..10 {
            assert!(
                r.output.contains(&format!("npm ERR! line {n}")),
                "npm ERR! line {n} must survive: {}",
                r.output
            );
        }
    }

    #[test]
    fn fail_open_when_preserved_lines_exceed_cap() {
        // More than MAX_REINJECTED unique secret-bearing droppable lines: the
        // handler would drop them and the cap cannot re-inject all, so we must
        // fail open and return the raw input rather than truncate (and silently
        // lose secrets 33+).
        let count = MAX_REINJECTED + 5;
        // Build a long listing (ls family) where the secret-bearing entries sit
        // PAST the ls handler's kept head (LS_KEEP=50), so the handler drops
        // them and the scanner must re-inject more than the cap allows.
        let mut input = String::new();
        // Padding head the ls handler keeps verbatim, pushing the secrets out of
        // the kept window so they are genuinely dropped.
        for n in 0..80 {
            input.push_str(&format!("plain_file_{n:04}.txt\n"));
        }
        for n in 0..count {
            // AKIA + 16 uppercase hex chars; vary the tail so each is unique.
            input.push_str(&format!("file_AKIA{:016X}_creds.txt\n", n));
        }
        let r = compress_output(&input, Some("ls"));
        assert!(
            !r.compressed,
            "more than {MAX_REINJECTED} secret lines must fail open, not truncate"
        );
        // Every single secret-bearing line survives in the raw output.
        for n in 0..count {
            assert!(
                r.output.contains(&format!("AKIA{:016X}", n)),
                "secret #{n} must survive in raw output"
            );
        }
        // The misleading "more ... preserved in raw output" note is gone.
        assert!(
            !r.output.contains("preserved in raw output"),
            "misleading cap note must be removed: {}",
            r.output
        );
    }

    // Local mirror of the strip used by compress_output, for the raw-passthrough
    // equality assertion above.
    fn safety_strip(s: &str) -> String {
        super::safety::strip_ansi(s)
    }
}