Skip to main content

difflore_core/observability/
observation.rs

1//! `PostToolUse` observation classifier.
2//!
3//! Third supply line for candidate rules. When the Claude Code
4//! `PostToolUse` hook fires for an Edit / `MultiEdit` / Write tool, the
5//! CLI calls [`classify`] to turn the raw event into a structured
6//! [`Observation`] and enqueues it via `OutboxQueue` with
7//! `kind="observation"`. The cloud consumer clusters those rows by
8//! `content_hash` and feeds the rule-promoter alongside `remember_rule`
9//! captures and GitHub-App PR-merge signatures.
10//!
11//! Classification is deterministic and keyword-driven — no LLM call,
12//! sub-millisecond target. The heuristics are intentionally simple:
13//!
14//!   * `Write` of a brand-new file ⇒ `feature`
15//!   * Edit that strips a visible `FIXME` / `BUG` / `TODO` ⇒ `bugfix`
16//!   * Edit where the diff is whitespace-only (no semantic deltas) ⇒
17//!     `refactor`
18//!   * Anything else ⇒ `change`
19//!
20//! `discovery` and `decision` are declared as valid `obs_type` values
21//! for forward-compat but are never emitted from the local classifier
22//! (they need LLM or conversation context).
23//!
//! Privacy guard: edits touching secret-bearing paths (any path
//! containing `.env`, `.secrets`, `.key`, `.pem`, `id_rsa` or
//! `credentials`, matched as a case-insensitive substring) are
//! dropped *before* classification. The user cannot opt in — these
//! files must never leave the local machine via the observation
//! channel.
29
30use sha2::{Digest, Sha256};
31
32pub use crate::cloud::api_types::{Observation, ObservationScope};
33use crate::observability::privacy::strip_private_tagged_regions;
34
/// Input payload for [`classify`]. Borrowed so the caller doesn't
/// have to clone every string coming out of the hook event; the
/// classifier only needs read access.
///
/// Every field is either a borrowed string or a plain integer, which
/// is why the struct can be `Copy`.
#[derive(Debug, Clone, Copy)]
pub struct ClassifyInput<'a> {
    /// Tool name as reported by the hook adapter: `"Edit" |
    /// "MultiEdit" | "Write"`. Any other tool makes [`classify`]
    /// return `None`.
    pub tool: &'a str,
    /// Target file path. `None` short-circuits the classifier, and so
    /// does a path rejected by [`is_privacy_denied`].
    pub file_path: Option<&'a str>,
    /// Adapter-synthesised unified-ish diff (e.g. `-old\n+new\n`
    /// lines). Used for whitespace-only detection and as the source
    /// of the narrative and the diff excerpt.
    pub diff: Option<&'a str>,
    /// Raw post-edit text (`new_string` / content). For Edit and
    /// `MultiEdit` this is the replacement text. At least one of
    /// `diff` / `new_text` must be present for an observation to be
    /// produced.
    pub new_text: Option<&'a str>,
    /// Raw pre-edit text (`old_string`). `None` for Write events.
    pub old_text: Option<&'a str>,
    /// Platform session id from the hook stdin payload. `None` when
    /// unknown; [`classify`] then records an empty string.
    pub session_id: Option<&'a str>,
    /// Optional timestamp override in Unix milliseconds (mainly for
    /// tests). `None` falls back to `SystemTime::now()`.
    pub ts_ms: Option<i64>,
}
60
/// Maximum size, in bytes, of the diff excerpt captured in the
/// observation payload. The cloud side does its own heavier
/// clustering; we ship just enough context for a human reviewer to
/// recognise the edit. The truncation marker is appended on top of
/// this budget, so a truncated excerpt is slightly longer than the
/// cap.
pub const DIFF_EXCERPT_MAX_BYTES: usize = 1024;

/// Hard title length cap, counted in `char`s rather than bytes.
/// Matches the `title` doc comment on `Observation`.
pub const TITLE_MAX_CHARS: usize = 120;

/// Hard narrative length cap, counted in `char`s rather than bytes.
pub const NARRATIVE_MAX_CHARS: usize = 500;
72
/// Patterns that short-circuit classification. Hardcoded on purpose
/// — the user has no way to disable this guard from config.
///
/// Each entry is matched as an ASCII-case-insensitive substring of
/// the whole path (see `is_privacy_denied`), so `.key` also catches
/// names such as `.keystore`.
// NOTE(review): other private-key file names (e.g. `id_ed25519`) are
// not covered by this list — confirm whether that is intentional.
const PRIVACY_DENY_SUBSTRINGS: &[&str] =
    &[".env", ".secrets", ".key", ".pem", "id_rsa", "credentials"];
77
78/// Classify a `PostToolUse` event. Returns `None` when the event should
79/// not produce an observation (non-edit tool, no file path, missing
80/// diff signal, or a privacy-denied path).
81pub fn classify(input: &ClassifyInput<'_>) -> Option<Observation> {
82    // Only file-mutating tools produce observations.
83    if !matches!(input.tool, "Edit" | "MultiEdit" | "Write") {
84        return None;
85    }
86
87    let file_path = input.file_path?;
88    if is_privacy_denied(file_path) {
89        return None;
90    }
91
92    // At least one of diff / new_text must be present; otherwise the
93    // classifier has nothing to key off.
94    if input.diff.is_none() && input.new_text.is_none() {
95        return None;
96    }
97
98    let obs_type = determine_obs_type(input);
99    let title = build_title(input.tool, file_path, &obs_type);
100    let narrative = build_narrative(input);
101    let diff_excerpt = input
102        .diff
103        .map(strip_private_tagged_regions)
104        .map(|diff| truncate_diff_excerpt(&diff));
105
106    let session_id = input.session_id.unwrap_or("").to_owned();
107    let ts_ms = input.ts_ms.unwrap_or_else(now_unix_ms);
108    let content_hash =
109        compute_content_hash(&session_id, Some(file_path), &title, narrative.as_deref());
110
111    Some(Observation {
112        session_id,
113        ts_ms,
114        obs_type,
115        tool: input.tool.to_owned(),
116        file_path: Some(file_path.to_owned()),
117        scope: derive_scope(file_path),
118        title,
119        narrative,
120        diff_excerpt,
121        content_hash,
122    })
123}
124
125/// Heuristic core. Order matters: we check the most specific
126/// patterns first and fall through to the generic `change` label.
127fn determine_obs_type(input: &ClassifyInput<'_>) -> String {
128    // Write of a new file → feature. Claude Code's Write tool only
129    // has new_text (no old_text), which is exactly the shape we use
130    // here. MultiEdit / Edit always carry old_text so they can't
131    // trip this branch.
132    if input.tool == "Write" && input.old_text.is_none() {
133        return "feature".to_owned();
134    }
135
136    // Bugfix: a comment-removal pattern. We look at the old_text
137    // only — removing a bug-marker comment counts even when the
138    // replacement still has other comments.
139    if let Some(old) = input.old_text
140        && removes_bug_marker(old, input.new_text.unwrap_or(""))
141    {
142        return "bugfix".to_owned();
143    }
144
145    // Refactor: whitespace-only diff. Strip whitespace from both
146    // sides of every changed hunk and check that they're equal.
147    if let Some(diff) = input.diff {
148        if diff_is_whitespace_only(diff) {
149            return "refactor".to_owned();
150        }
151    } else if let (Some(old), Some(new)) = (input.old_text, input.new_text)
152        && strip_ws(old) == strip_ws(new)
153        && old != new
154    {
155        return "refactor".to_owned();
156    }
157
158    "change".to_owned()
159}
160
161/// `true` when the old text contains a visible bug marker (FIXME /
162/// BUG / TODO) and the new text no longer contains that same
163/// marker. Uppercase-only match — lowercase `todo` in code would
164/// produce way too many false positives.
165///
166/// Counts only standalone occurrences (alphanumeric/underscore
167/// neighbours rule them out). That keeps `DEBUG`/`debugger` from
168/// firing the `BUG` rule, which silently mislabelled refactors as
169/// bugfixes whenever a `DEBUG` line was removed.
170fn removes_bug_marker(old: &str, new: &str) -> bool {
171    const MARKERS: &[&str] = &["FIXME", "BUG", "TODO"];
172    for marker in MARKERS {
173        let before = count_word_occurrences(old, marker);
174        let after = count_word_occurrences(new, marker);
175        if before > after {
176            return true;
177        }
178    }
179    false
180}
181
182fn count_word_occurrences(haystack: &str, needle: &str) -> usize {
183    if needle.is_empty() {
184        return 0;
185    }
186    let bytes = haystack.as_bytes();
187    let nbytes = needle.as_bytes();
188    let mut count = 0;
189    let mut i = 0;
190    while i + nbytes.len() <= bytes.len() {
191        if &bytes[i..i + nbytes.len()] == nbytes {
192            let prev_ok = i == 0 || !is_word_byte(bytes[i - 1]);
193            let next_ok = i + nbytes.len() == bytes.len() || !is_word_byte(bytes[i + nbytes.len()]);
194            if prev_ok && next_ok {
195                count += 1;
196                i += nbytes.len();
197                continue;
198            }
199        }
200        i += 1;
201    }
202    count
203}
204
/// `true` for bytes that can be part of an identifier-like word:
/// ASCII letters, ASCII digits and `_`. Every other byte — including
/// all non-ASCII bytes — counts as a word boundary.
const fn is_word_byte(b: u8) -> bool {
    matches!(b, b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z')
}
208
209/// `true` iff every `-` / `+` line in the diff has the same content
210/// after stripping whitespace. Lines that start with neither `-`
211/// nor `+` are ignored (context lines).
212fn diff_is_whitespace_only(diff: &str) -> bool {
213    let mut removed = String::new();
214    let mut added = String::new();
215    let mut saw_change = false;
216    for line in diff.lines() {
217        if let Some(rest) = line.strip_prefix('-') {
218            saw_change = true;
219            removed.push_str(rest);
220            removed.push('\n');
221        } else if let Some(rest) = line.strip_prefix('+') {
222            saw_change = true;
223            added.push_str(rest);
224            added.push('\n');
225        }
226    }
227    if !saw_change {
228        return false;
229    }
230    strip_ws(&removed) == strip_ws(&added)
231}
232
/// Returns `s` with every whitespace character removed. "Whitespace"
/// is the Unicode `White_Space` property (what `char::is_whitespace`
/// reports), not just the ASCII subset.
///
/// Cheap and good enough for the refactor heuristic — reorderings
/// would slip past, but so would a hand-written `rustfmt` tweak,
/// which is the whole point.
fn strip_ws(s: &str) -> String {
    s.split_whitespace().collect()
}
239
240/// Build a ≤ 120-char title. Shape: `"{tool} {file}: {hint}"` where
241/// the hint is derived from the `obs_type`. Truncation appends `"…"`.
242fn build_title(tool: &str, file_path: &str, obs_type: &str) -> String {
243    let hint = match obs_type {
244        "feature" => "new file",
245        "bugfix" => "remove bug marker",
246        "refactor" => "whitespace/rename",
247        _ => "edit",
248    };
249    let base = format!("{tool} {file_path}: {hint}");
250    truncate_chars(&base, TITLE_MAX_CHARS)
251}
252
253/// Build a ≤ 500-char narrative. We concatenate the first few diff
254/// lines so the cloud-side rule-promoter has something to display
255/// without loading the full payload.
256fn build_narrative(input: &ClassifyInput<'_>) -> Option<String> {
257    let diff = strip_private_tagged_regions(input.diff?);
258    let mut collected = String::new();
259    for line in diff.lines().take(6) {
260        if !collected.is_empty() {
261            collected.push('\n');
262        }
263        collected.push_str(line);
264    }
265    if collected.is_empty() {
266        return None;
267    }
268    Some(truncate_chars(&collected, NARRATIVE_MAX_CHARS))
269}
270
/// Caps `s` at `max_chars` characters, cutting on a char boundary
/// (not a byte boundary — matters for UTF-8 input).
///
/// A string that already fits is returned unchanged. Otherwise the
/// first `max_chars - 1` characters are kept and `"…"` is appended,
/// so the result is exactly `max_chars` characters long. With
/// `max_chars == 0` the result is always empty, so the cap is never
/// exceeded.
///
/// Only the first `max_chars + 1` characters are inspected; the
/// rest of a long input is never scanned.
fn truncate_chars(s: &str, max_chars: usize) -> String {
    if max_chars == 0 {
        // No room even for the ellipsis.
        return String::new();
    }

    let mut offsets = s.char_indices().map(|(offset, _)| offset);

    // Byte offset of the character occupying the last allowed slot.
    // If there is no such character the string is shorter than the
    // cap.
    let last_slot = match offsets.nth(max_chars - 1) {
        Some(offset) => offset,
        None => return s.to_owned(),
    };
    // Nothing after the last slot: the string fits exactly.
    if offsets.next().is_none() {
        return s.to_owned();
    }

    // Too long: give the last slot to the ellipsis.
    let mut out = String::with_capacity(last_slot + '…'.len_utf8());
    out.push_str(&s[..last_slot]);
    out.push('…');
    out
}
281
282/// Byte-level truncation for the diff excerpt. Diffs can be huge;
283/// we stash the first `DIFF_EXCERPT_MAX_BYTES` bytes plus a marker.
284fn truncate_diff_excerpt(diff: &str) -> String {
285    if diff.len() <= DIFF_EXCERPT_MAX_BYTES {
286        return diff.to_owned();
287    }
288    // Find the largest char boundary ≤ max so we don't split a
289    // multi-byte codepoint.
290    let mut end = DIFF_EXCERPT_MAX_BYTES;
291    while end > 0 && !diff.is_char_boundary(end) {
292        end -= 1;
293    }
294    let mut out = String::with_capacity(end + 16);
295    out.push_str(&diff[..end]);
296    out.push_str("\n…[truncated]");
297    out
298}
299
300/// `sha256(session_id|file|title|narrative)[:16]` as lowercase hex.
301/// Mirrors the 16-char convention used by `remember_rule` content
302/// hashes (skills.rs). Sixty-four bits is plenty for cloud-side
303/// dedup inside a per-user corpus.
304pub(crate) fn compute_content_hash(
305    session_id: &str,
306    file_path: Option<&str>,
307    title: &str,
308    narrative: Option<&str>,
309) -> String {
310    let mut hasher = Sha256::new();
311    hasher.update(session_id.as_bytes());
312    hasher.update(b"|");
313    hasher.update(file_path.unwrap_or("").as_bytes());
314    hasher.update(b"|");
315    hasher.update(title.as_bytes());
316    hasher.update(b"|");
317    hasher.update(narrative.unwrap_or("").as_bytes());
318    let digest = hasher.finalize();
319    let mut hex = String::with_capacity(16);
320    for byte in digest.iter().take(8) {
321        hex.push_str(&format!("{byte:02x}"));
322    }
323    hex
324}
325
326/// `true` when the path matches one of the hardcoded secret
327/// patterns. Substring match (not full-glob) — good enough to cover
328/// `src/config/.env.local` and `infra/prod.credentials.json`.
329pub fn is_privacy_denied(path: &str) -> bool {
330    let lower = path.to_ascii_lowercase();
331    // `.pem`, `.key`, `.secrets` should match as file extensions or
332    // embedded tokens; checking lowercase substring handles both.
333    // The full list is small and uppercase-insensitive.
334    PRIVACY_DENY_SUBSTRINGS
335        .iter()
336        .any(|needle| lower.contains(needle))
337}
338
339fn derive_scope(file_path: &str) -> Option<ObservationScope> {
340    let trimmed = file_path.trim_matches('/');
341    if trimmed.is_empty() {
342        return None;
343    }
344
345    let parts: Vec<&str> = trimmed.split('/').filter(|part| !part.is_empty()).collect();
346    if parts.is_empty() {
347        return None;
348    }
349
350    let display_name = parts.last().map(|part| (*part).to_owned());
351    let parent_path = if parts.len() > 1 {
352        Some(parts[..parts.len() - 1].join("/"))
353    } else {
354        None
355    };
356
357    Some(ObservationScope {
358        anchor_kind: "file".to_owned(),
359        anchor_key: parts.join("/"),
360        parent_path,
361        display_name,
362    })
363}
364
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Returns `0` when the system clock reports a time before the
/// epoch. The millisecond count is converted with a checked
/// conversion and saturates at `i64::MAX` instead of silently
/// wrapping the way an `as` cast would.
fn now_unix_ms() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| {
            i64::try_from(elapsed.as_millis()).unwrap_or(i64::MAX)
        })
}
371
// Unit tests for the observation classifier: one test per heuristic
// branch, the privacy guard, redaction, hashing, scope derivation,
// the wire shape and excerpt truncation.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `ClassifyInput` for the given tool, file and texts.
    /// Session id and timestamp are fixed so that every derived
    /// value (including the content hash) is deterministic.
    fn input<'a>(
        tool: &'a str,
        file: &'a str,
        diff: Option<&'a str>,
        new_text: Option<&'a str>,
        old_text: Option<&'a str>,
    ) -> ClassifyInput<'a> {
        ClassifyInput {
            tool,
            file_path: Some(file),
            diff,
            new_text,
            old_text,
            session_id: Some("sess_test"),
            ts_ms: Some(1_714_000_000_000),
        }
    }

    #[test]
    fn classify_write_new_file_returns_feature() {
        // Claude Code's Write tool ships `content` but no
        // `old_string`. The classifier must tag that shape as a
        // new-file feature, not a generic change.
        let inp = input(
            "Write",
            "src/new_mod.rs",
            Some("+fn hello() {}\n"),
            Some("fn hello() {}\n"),
            None,
        );
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "feature");
        assert_eq!(obs.tool, "Write");
        assert_eq!(obs.file_path.as_deref(), Some("src/new_mod.rs"));
        assert!(
            obs.title.contains("Write"),
            "title missing tool: {}",
            obs.title
        );
    }

    #[test]
    fn classify_edit_removing_fixme_returns_bugfix() {
        // An Edit that drops a visible FIXME comment from the old
        // code must be tagged bugfix — this is the strongest local
        // signal we have that the user just shipped a real fix.
        let old = "// FIXME: panics on None\nfoo.unwrap();\n";
        let new = "if let Some(x) = foo { use_x(x); }\n";
        let diff =
            "-// FIXME: panics on None\n-foo.unwrap();\n+if let Some(x) = foo { use_x(x); }\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "bugfix");
    }

    #[test]
    fn classify_edit_whitespace_only_returns_refactor() {
        // rustfmt-style tweak: same tokens, different whitespace.
        // The classifier must tag these as refactor so the cloud
        // rule-promoter doesn't treat them as semantic change
        // candidates.
        let old = "let x=1;let y=2;";
        let new = "let x = 1;\nlet y = 2;";
        let diff = "-let x=1;let y=2;\n+let x = 1;\n+let y = 2;\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "refactor");
    }

    #[test]
    fn removing_debug_line_does_not_count_as_bug_marker_removal() {
        // Regression: substring matching let "DEBUG" trigger the "BUG"
        // marker — removing a `// DEBUG: …` line silently mislabelled
        // a refactor as a bugfix. Word-boundary counting keeps
        // BUG/DEBUG distinct.
        let old = "// DEBUG: tracing\nlog::trace!(\"x={x}\");\n";
        let new = "// (debug line removed)\n";
        let diff = "-// DEBUG: tracing\n-log::trace!(\"x={x}\");\n+// (debug line removed)\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_ne!(
            obs.obs_type, "bugfix",
            "DEBUG → empty must not be classified as a bugfix"
        );
    }

    #[test]
    fn classify_edit_default_returns_change() {
        // A plain-Jane semantic edit with no bug-marker removal and
        // non-trivial content change must fall through to the safe
        // `change` default.
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "change");
    }

    #[test]
    fn privacy_guard_blocks_env_files() {
        // `.env.local` is the single most-common secret-bearing file
        // an agent will touch. The guard must fire before classify
        // returns so the observation never gets enqueued.
        let inp = input(
            "Write",
            "src/app/.env.local",
            Some("+SECRET=abc\n"),
            Some("SECRET=abc\n"),
            None,
        );
        assert!(classify(&inp).is_none());
    }

    #[test]
    fn privacy_guard_allows_normal_source_files() {
        // Sanity check: ordinary `.rs` paths must NOT be denied.
        let inp = input(
            "Write",
            "src/foo.rs",
            Some("+fn main() {}\n"),
            Some("fn main() {}\n"),
            None,
        );
        assert!(classify(&inp).is_some());
    }

    #[test]
    fn privacy_guard_covers_pem_key_credentials() {
        // Confirm every pattern in the hardcoded list triggers.
        for path in &[
            "config/.env",
            "app.secrets.json",
            "infra/prod.secrets.yaml",
            "keys/server.key",
            "certs/app.pem",
            "home/user/.ssh/id_rsa",
            "credentials.json",
        ] {
            assert!(is_privacy_denied(path), "expected deny for `{path}`");
        }
    }

    #[test]
    fn private_tagged_regions_are_redacted_from_observation_payload() {
        // A `<private>…</private>` span in the diff must not reach
        // the payload: both the narrative and the diff excerpt have
        // to carry the redaction placeholder instead of the tagged
        // content.
        let diff = "-safe\n+safe <private>token=abc</private>\n+done\n";
        let inp = input(
            "Edit",
            "src/foo.rs",
            Some(diff),
            Some("safe done\n"),
            Some("safe\n"),
        );

        let obs = classify(&inp).expect("some");

        assert!(
            obs.narrative
                .as_deref()
                .unwrap()
                .contains("[redacted private content]")
        );
        assert!(
            obs.diff_excerpt
                .as_deref()
                .unwrap()
                .contains("[redacted private content]")
        );
        assert!(!obs.narrative.as_deref().unwrap().contains("token=abc"));
        assert!(!obs.diff_excerpt.as_deref().unwrap().contains("token=abc"));
    }

    #[test]
    fn content_hash_is_stable_and_file_sensitive() {
        // Two observations from identical inputs produce the same hash
        // (idempotent cloud insertion), but a file-path change breaks it.
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let a = classify(&inp).expect("some");
        let b = classify(&inp).expect("some");
        assert_eq!(a.content_hash, b.content_hash);
        assert_eq!(a.content_hash.len(), 16);

        let other = classify(&input("Edit", "b.rs", Some(diff), Some(new), Some(old))).unwrap();
        assert_ne!(a.content_hash, other.content_hash);
    }

    #[test]
    fn non_edit_tool_returns_none() {
        // Read / Bash / anything-else-we-don't-recognise must
        // short-circuit. The hook is responsible for filtering
        // too, but defence in depth keeps noise out of the
        // observation stream.
        let inp = input("Read", "src/foo.rs", None, None, None);
        assert!(classify(&inp).is_none());
    }

    #[test]
    fn missing_diff_and_new_text_returns_none() {
        // If the adapter couldn't reconstruct a diff AND didn't
        // give us new_text, there's nothing to classify. We MUST
        // return None rather than emit an observation with empty
        // content.
        let inp = input("Edit", "src/foo.rs", None, None, Some("old"));
        assert!(classify(&inp).is_none());
    }

    #[test]
    fn classify_emits_structured_scope_metadata() {
        // The scope is derived from the file path alone: the full
        // path is the anchor key, the directory part the parent
        // path, and the file name the display name.
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let obs = classify(&input(
            "Edit",
            "src/auth/login/handler.rs",
            Some(diff),
            Some(new),
            Some(old),
        ))
        .expect("some");

        assert_eq!(
            obs.scope,
            Some(ObservationScope {
                anchor_kind: "file".to_owned(),
                anchor_key: "src/auth/login/handler.rs".to_owned(),
                parent_path: Some("src/auth/login".to_owned()),
                display_name: Some("handler.rs".to_owned()),
            })
        );
    }

    #[test]
    fn wire_shape_accepts_optional_scope_metadata() {
        // Deserialising a JSON payload that carries a `scope` object
        // must populate `Observation::scope`.
        let payload = serde_json::json!({
            "session_id": "sess_new",
            "ts_ms": 2,
            "obs_type": "bugfix",
            "tool": "Edit",
            "file_path": "src/auth/login/handler.rs",
            "scope": {
                "anchor_kind": "file",
                "anchor_key": "src/auth/login/handler.rs",
                "parent_path": "src/auth/login",
                "display_name": "handler.rs"
            },
            "title": "Edit src/auth/login/handler.rs: remove bug marker",
            "narrative": "guard login retry state",
            "diff_excerpt": "-old\n+new",
            "content_hash": "def456"
        });

        let obs: Observation = serde_json::from_value(payload).expect("deserialize");
        assert_eq!(
            obs.scope.as_ref().map(|scope| scope.anchor_key.as_str()),
            Some("src/auth/login/handler.rs")
        );
    }

    /// Ignored-by-default helper that prints a wire-format example for
    /// each `obs_type`. Run with `cargo test -p difflore-core --lib
    /// observation::tests::print_wire_samples -- --ignored --nocapture`.
    /// Emits stable payload shapes without re-deriving them.
    #[test]
    #[ignore = "doc helper for sample wire output, run manually"]
    fn print_wire_samples() {
        // One representative input per label the local classifier
        // can emit; the label is only used as a heading in the
        // printed output.
        let samples = [
            (
                "feature",
                input(
                    "Write",
                    "src/new_mod.rs",
                    Some("+fn hello() {}\n+pub fn world() {}\n"),
                    Some("fn hello() {}\npub fn world() {}\n"),
                    None,
                ),
            ),
            (
                "bugfix",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some(
                        "-// FIXME: crash on None\n-foo.unwrap();\n+if let Some(x) = foo { use_x(x); }\n",
                    ),
                    Some("if let Some(x) = foo { use_x(x); }\n"),
                    Some("// FIXME: crash on None\nfoo.unwrap();\n"),
                ),
            ),
            (
                "refactor",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some("-let x=1;let y=2;\n+let x = 1;\n+let y = 2;\n"),
                    Some("let x = 1;\nlet y = 2;"),
                    Some("let x=1;let y=2;"),
                ),
            ),
            (
                "change",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some("-let x = 1;\n+let x = compute_answer();\n"),
                    Some("let x = compute_answer();"),
                    Some("let x = 1;"),
                ),
            ),
        ];
        for (label, inp) in samples {
            let obs = classify(&inp).expect("some");
            let json = serde_json::to_string_pretty(&obs).unwrap();
            println!("=== {label} ===\n{json}\n");
        }
    }

    #[test]
    fn diff_excerpt_truncates_large_diffs() {
        // Build a 4 KB synthetic diff and confirm the excerpt caps
        // at roughly DIFF_EXCERPT_MAX_BYTES (plus the truncation
        // marker).
        let big: String = (0..4096).map(|_| 'x').collect();
        let diff = format!("-{big}\n+{big}Y\n");
        let inp = input("Edit", "src/foo.rs", Some(&diff), Some("yYY"), Some("xxx"));
        let obs = classify(&inp).expect("some");
        let excerpt = obs.diff_excerpt.expect("excerpt present");
        assert!(
            excerpt.len() <= DIFF_EXCERPT_MAX_BYTES + 32,
            "excerpt too long: {}",
            excerpt.len()
        );
        assert!(excerpt.ends_with("[truncated]"));
    }
}