// difflore-core 0.1.0

//! `PostToolUse` observation classifier.
//!
//! Third supply line for candidate rules. When the Claude Code
//! `PostToolUse` hook fires for an Edit / `MultiEdit` / Write tool, the
//! CLI calls [`classify`] to turn the raw event into a structured
//! [`Observation`] and enqueues it via `OutboxQueue` with
//! `kind="observation"`. The cloud consumer clusters those rows by
//! `content_hash` and feeds the rule-promoter alongside `remember_rule`
//! captures and GitHub-App PR-merge signatures.
//!
//! Classification is deterministic and keyword-driven — no LLM call,
//! sub-millisecond target. The heuristics are intentionally simple:
//!
//!   * `Write` of a brand-new file ⇒ `feature`
//!   * Edit that strips a visible `FIXME` / `BUG` / `TODO` ⇒ `bugfix`
//!   * Edit where the diff is whitespace-only (no semantic deltas) ⇒
//!     `refactor`
//!   * Anything else ⇒ `change`
//!
//! `discovery` and `decision` are declared as valid `obs_type` values
//! for forward-compat but are never emitted from the local classifier
//! (they need LLM or conversation context).
//!
//! Privacy guard: edits touching secret-bearing paths (any path that
//! contains `.env`, `.secrets`, `.key`, `.pem`, `id_rsa` or
//! `credentials`, matched as a case-insensitive substring) are
//! dropped *before* classification. The user cannot opt in — these
//! files must never leave the local machine via the observation
//! channel.

use sha2::{Digest, Sha256};

pub use crate::cloud::api_types::{Observation, ObservationScope};
use crate::observability::privacy::strip_private_tagged_regions;

/// Input payload for [`classify`]. Borrowed so the caller doesn't
/// have to clone every string coming out of the hook event; the
/// classifier only needs read access.
///
/// The struct is `Copy`: it only holds borrowed string slices and an
/// optional timestamp.
#[derive(Debug, Clone, Copy)]
pub struct ClassifyInput<'a> {
    /// Tool name as reported by the hook adapter: `"Edit" |
    /// "MultiEdit" | "Write"`. Any other value makes [`classify`]
    /// return `None`.
    pub tool: &'a str,
    /// Target file path. `None` short-circuits the classifier, as
    /// does a path rejected by [`is_privacy_denied`].
    pub file_path: Option<&'a str>,
    /// Adapter-synthesised unified-ish diff (e.g. `-old\n+new\n`
    /// lines). Used for whitespace-only detection and as the source
    /// of the observation's narrative and diff excerpt.
    pub diff: Option<&'a str>,
    /// Raw post-edit text (`new_string` / content). For Edit and
    /// `MultiEdit` this is the replacement text.
    pub new_text: Option<&'a str>,
    /// Raw pre-edit text (`old_string`). `None` for Write events.
    pub old_text: Option<&'a str>,
    /// Platform session id from the hook stdin payload. `None`
    /// when unknown; [`classify`] then records an empty string.
    pub session_id: Option<&'a str>,
    /// Optional timestamp override in Unix milliseconds (mainly for
    /// tests). `None` falls back to `SystemTime::now()`.
    pub ts_ms: Option<i64>,
}

/// Maximum number of diff bytes copied into the observation's
/// excerpt. The cloud side does its own heavier clustering; we ship
/// just enough context for a human reviewer to recognise the edit.
/// A truncated excerpt additionally carries a short
/// `"\n…[truncated]"` marker on top of this budget.
pub const DIFF_EXCERPT_MAX_BYTES: usize = 1024;

/// Hard title length cap, counted in `char`s (not bytes). Matches
/// the `title` doc comment on `Observation`.
pub const TITLE_MAX_CHARS: usize = 120;

/// Hard narrative length cap, counted in `char`s (not bytes).
pub const NARRATIVE_MAX_CHARS: usize = 500;

/// Path fragments that short-circuit classification. Each entry is
/// matched as a plain substring of the ASCII-lowercased path (see
/// [`is_privacy_denied`]), so entries must be lowercase. Hardcoded
/// on purpose — the user has no way to disable this guard from
/// config.
const PRIVACY_DENY_SUBSTRINGS: &[&str] =
    &[".env", ".secrets", ".key", ".pem", "id_rsa", "credentials"];

/// Classify a `PostToolUse` event. Returns `None` when the event should
/// not produce an observation (non-edit tool, no file path, missing
/// diff signal, or a privacy-denied path).
pub fn classify(input: &ClassifyInput<'_>) -> Option<Observation> {
    // Only file-mutating tools produce observations.
    if !matches!(input.tool, "Edit" | "MultiEdit" | "Write") {
        return None;
    }

    let file_path = input.file_path?;
    if is_privacy_denied(file_path) {
        return None;
    }

    // At least one of diff / new_text must be present; otherwise the
    // classifier has nothing to key off.
    if input.diff.is_none() && input.new_text.is_none() {
        return None;
    }

    let obs_type = determine_obs_type(input);
    let title = build_title(input.tool, file_path, &obs_type);
    let narrative = build_narrative(input);
    let diff_excerpt = input
        .diff
        .map(strip_private_tagged_regions)
        .map(|diff| truncate_diff_excerpt(&diff));

    let session_id = input.session_id.unwrap_or("").to_owned();
    let ts_ms = input.ts_ms.unwrap_or_else(now_unix_ms);
    let content_hash =
        compute_content_hash(&session_id, Some(file_path), &title, narrative.as_deref());

    Some(Observation {
        session_id,
        ts_ms,
        obs_type,
        tool: input.tool.to_owned(),
        file_path: Some(file_path.to_owned()),
        scope: derive_scope(file_path),
        title,
        narrative,
        diff_excerpt,
        content_hash,
    })
}

/// Heuristic core. Order matters: we check the most specific
/// patterns first and fall through to the generic `change` label.
fn determine_obs_type(input: &ClassifyInput<'_>) -> String {
    // Write of a new file → feature. Claude Code's Write tool only
    // has new_text (no old_text), which is exactly the shape we use
    // here. MultiEdit / Edit always carry old_text so they can't
    // trip this branch.
    if input.tool == "Write" && input.old_text.is_none() {
        return "feature".to_owned();
    }

    // Bugfix: a comment-removal pattern. We look at the old_text
    // only — removing a bug-marker comment counts even when the
    // replacement still has other comments.
    if let Some(old) = input.old_text
        && removes_bug_marker(old, input.new_text.unwrap_or(""))
    {
        return "bugfix".to_owned();
    }

    // Refactor: whitespace-only diff. Strip whitespace from both
    // sides of every changed hunk and check that they're equal.
    if let Some(diff) = input.diff {
        if diff_is_whitespace_only(diff) {
            return "refactor".to_owned();
        }
    } else if let (Some(old), Some(new)) = (input.old_text, input.new_text)
        && strip_ws(old) == strip_ws(new)
        && old != new
    {
        return "refactor".to_owned();
    }

    "change".to_owned()
}

/// Reports whether the edit dropped at least one bug marker.
///
/// For each of `FIXME`, `BUG` and `TODO` (matched exactly as
/// written, i.e. uppercase) the standalone occurrences in `old` and
/// `new` are counted; the edit qualifies as soon as one marker
/// occurs more often in `old` than in `new`.
///
/// Only standalone occurrences count, so `DEBUG` or `BUG_ID` never
/// register as `BUG` — otherwise deleting a `DEBUG` line would be
/// mislabelled as a bugfix.
fn removes_bug_marker(old: &str, new: &str) -> bool {
    ["FIXME", "BUG", "TODO"].iter().any(|marker| {
        count_word_occurrences(old, marker) > count_word_occurrences(new, marker)
    })
}

/// Count the standalone, non-overlapping occurrences of `needle` in
/// `haystack`, comparing raw bytes (so the match is case-sensitive).
///
/// An occurrence is standalone when the byte just before it and the
/// byte just after it are each either absent (string edge) or not a
/// word byte per `is_word_byte`. An empty `needle` yields `0`.
fn count_word_occurrences(haystack: &str, needle: &str) -> usize {
    let hay = haystack.as_bytes();
    let pat = needle.as_bytes();
    let width = pat.len();
    if width == 0 {
        return 0;
    }

    let mut hits = 0;
    let mut pos = 0;
    while pos + width <= hay.len() {
        let end = pos + width;
        let standalone_hit = hay[pos..end] == *pat
            && !hay[..pos].last().is_some_and(|&b| is_word_byte(b))
            && !hay.get(end).is_some_and(|&b| is_word_byte(b));
        if standalone_hit {
            // Jump past the whole match so occurrences never overlap.
            hits += 1;
            pos = end;
        } else {
            // A rejected candidate only advances by one byte, so a
            // later, overlapping candidate is still examined.
            pos += 1;
        }
    }
    hits
}

/// `true` for bytes that can be part of an identifier-like word:
/// ASCII letters, ASCII digits and `_`. Every other byte — including
/// all bytes of multi-byte UTF-8 sequences — is a word boundary.
const fn is_word_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
}

/// Decide whether a diff changes nothing but whitespace.
///
/// All `-` lines are concatenated into one text and all `+` lines
/// into another (prefix removed, one `\n` per line); the diff is
/// whitespace-only when both texts are equal after `strip_ws`.
/// Lines starting with neither `-` nor `+` are treated as context
/// and skipped. A diff without any `-`/`+` line returns `false`.
fn diff_is_whitespace_only(diff: &str) -> bool {
    let collect_side = |sign: char| {
        diff.lines()
            .filter_map(|line| line.strip_prefix(sign))
            .fold(String::new(), |mut side, body| {
                side.push_str(body);
                side.push('\n');
                side
            })
    };
    let removed = collect_side('-');
    let added = collect_side('+');

    // Every changed line contributes at least its '\n', so two empty
    // sides mean the diff had no changed line at all.
    if removed.is_empty() && added.is_empty() {
        return false;
    }
    strip_ws(&removed) == strip_ws(&added)
}

/// Return `s` with every whitespace character removed. "Whitespace"
/// is the Unicode `White_Space` set (as used by
/// `char::is_whitespace`), not just ASCII blanks.
///
/// This is the normaliser behind the refactor heuristic: two texts
/// that are equal after stripping differ only in formatting.
fn strip_ws(s: &str) -> String {
    s.split_whitespace().collect()
}

/// Compose the observation title, shaped `"{tool} {file}: {hint}"`.
///
/// The hint is a fixed phrase per `obs_type`; unknown types fall
/// back to `"edit"`. The result is capped at `TITLE_MAX_CHARS`
/// characters via `truncate_chars`.
fn build_title(tool: &str, file_path: &str, obs_type: &str) -> String {
    const HINTS: [(&str, &str); 3] = [
        ("feature", "new file"),
        ("bugfix", "remove bug marker"),
        ("refactor", "whitespace/rename"),
    ];
    let hint = HINTS
        .iter()
        .find(|(kind, _)| *kind == obs_type)
        .map_or("edit", |(_, phrase)| *phrase);

    let untrimmed = [tool, " ", file_path, ": ", hint].concat();
    truncate_chars(&untrimmed, TITLE_MAX_CHARS)
}

/// Produce the short narrative shown next to an observation.
///
/// The narrative is built from the first six lines of the diff
/// (after private-tagged regions have been stripped), joined with
/// `\n` and capped at `NARRATIVE_MAX_CHARS` characters. Empty lines
/// at the start of that six-line window are left out. Returns
/// `None` when there is no diff or the window holds no text.
fn build_narrative(input: &ClassifyInput<'_>) -> Option<String> {
    let redacted = strip_private_tagged_regions(input.diff?);
    let head = redacted
        .lines()
        .take(6)
        // Leading blanks carry no information; blanks after the
        // first real line are kept as-is.
        .skip_while(|line| line.is_empty())
        .collect::<Vec<_>>()
        .join("\n");

    if head.is_empty() {
        None
    } else {
        Some(truncate_chars(&head, NARRATIVE_MAX_CHARS))
    }
}

/// Cap `s` at `max_chars` characters (Unicode scalar values, not
/// bytes — so multi-byte UTF-8 input is never split mid-character).
///
/// * If `s` already fits, it is returned unchanged.
/// * Otherwise the first `max_chars - 1` characters are kept and a
///   single `'…'` is appended, so the result is exactly `max_chars`
///   characters long.
/// * `max_chars == 0` yields an empty string: there is no room for
///   even the ellipsis, and the cap must never be exceeded.
///
/// The scan stops as soon as the cap is known to be exceeded, so
/// very long inputs are not walked to the end.
fn truncate_chars(s: &str, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }
    let keep = max_chars - 1;

    // Byte offsets of the characters at index `keep` and `keep + 1`.
    // The second one only exists when `s` has more than `max_chars`
    // characters, i.e. when truncation is actually required.
    let mut starts = s.char_indices().map(|(offset, _)| offset).skip(keep);
    match (starts.next(), starts.next()) {
        (Some(cut), Some(_)) => {
            let mut out = String::with_capacity(cut + '…'.len_utf8());
            out.push_str(&s[..cut]);
            out.push('…');
            out
        }
        _ => s.to_owned(),
    }
}

/// Cap a diff at `DIFF_EXCERPT_MAX_BYTES` bytes for the payload.
///
/// A diff within the budget is returned unchanged. A longer one is
/// cut at the last UTF-8 character boundary that does not exceed
/// the budget, and `"\n…[truncated]"` is appended so readers can
/// tell the excerpt is partial (the marker comes on top of the
/// budget).
fn truncate_diff_excerpt(diff: &str) -> String {
    if diff.len() <= DIFF_EXCERPT_MAX_BYTES {
        return diff.to_owned();
    }
    // Walk back from the budget to the nearest char boundary; offset
    // 0 always qualifies, so the search cannot come up empty.
    let cut = (0..=DIFF_EXCERPT_MAX_BYTES)
        .rev()
        .find(|&offset| diff.is_char_boundary(offset))
        .unwrap_or(0);
    format!("{}\n…[truncated]", &diff[..cut])
}

/// Dedup key for an observation: the first 8 bytes of
/// `sha256(session_id|file|title|narrative)`, rendered as 16
/// lowercase hex characters. Missing `file_path` / `narrative`
/// hash as empty strings.
///
/// Mirrors the 16-char convention used by `remember_rule` content
/// hashes (skills.rs). Sixty-four bits is plenty for cloud-side
/// dedup inside a per-user corpus.
pub(crate) fn compute_content_hash(
    session_id: &str,
    file_path: Option<&str>,
    title: &str,
    narrative: Option<&str>,
) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let preimage = [
        session_id,
        file_path.unwrap_or(""),
        title,
        narrative.unwrap_or(""),
    ]
    .join("|");
    let digest = Sha256::digest(preimage.as_bytes());

    // Two lowercase hex digits per byte, high nibble first.
    let mut hex = String::with_capacity(16);
    for &byte in digest.iter().take(8) {
        hex.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    hex
}

/// Reports whether `path` looks like a secret-bearing file that must
/// never be turned into an observation.
///
/// The path is ASCII-lowercased and checked for any of the
/// hardcoded fragments in `PRIVACY_DENY_SUBSTRINGS`. This is a
/// plain substring test rather than a glob, so a fragment matches
/// anywhere in the path — as an extension (`certs/app.pem`), a file
/// name (`src/config/.env.local`) or an embedded token
/// (`infra/prod.credentials.json`).
pub fn is_privacy_denied(path: &str) -> bool {
    let folded = path.to_ascii_lowercase();
    for fragment in PRIVACY_DENY_SUBSTRINGS {
        if folded.contains(*fragment) {
            return true;
        }
    }
    false
}

/// Derive the file-level scope metadata for a path.
///
/// The path is split on `/`, ignoring empty segments (leading,
/// trailing or doubled slashes). The resulting scope has
/// `anchor_kind` `"file"`, the re-joined segments as `anchor_key`,
/// the last segment as `display_name`, and every segment before it
/// as `parent_path` (`None` for a single-segment path). Returns
/// `None` when the path has no non-empty segment at all.
fn derive_scope(file_path: &str) -> Option<ObservationScope> {
    let segments: Vec<&str> = file_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();
    let (leaf, ancestors) = segments.split_last()?;

    let parent_path = (!ancestors.is_empty()).then(|| ancestors.join("/"));

    Some(ObservationScope {
        anchor_kind: "file".to_owned(),
        anchor_key: segments.join("/"),
        parent_path,
        display_name: Some((*leaf).to_owned()),
    })
}

/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Returns `0` if the system clock reports a time before the epoch.
/// The millisecond count is converted with a checked conversion and
/// saturates at `i64::MAX` instead of silently wrapping.
fn now_unix_ms() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| {
            i64::try_from(elapsed.as_millis()).unwrap_or(i64::MAX)
        })
}

// Unit tests for the classifier: one test per heuristic branch, the
// privacy guard, payload redaction, hashing, scope metadata and the
// excerpt size cap.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `ClassifyInput` with a fixed session id and timestamp
    /// so every test gets deterministic output.
    fn input<'a>(
        tool: &'a str,
        file: &'a str,
        diff: Option<&'a str>,
        new_text: Option<&'a str>,
        old_text: Option<&'a str>,
    ) -> ClassifyInput<'a> {
        ClassifyInput {
            tool,
            file_path: Some(file),
            diff,
            new_text,
            old_text,
            session_id: Some("sess_test"),
            ts_ms: Some(1_714_000_000_000),
        }
    }

    #[test]
    fn classify_write_new_file_returns_feature() {
        // Claude Code's Write tool ships `content` but no
        // `old_string`. The classifier must tag that shape as a
        // new-file feature, not a generic change.
        let inp = input(
            "Write",
            "src/new_mod.rs",
            Some("+fn hello() {}\n"),
            Some("fn hello() {}\n"),
            None,
        );
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "feature");
        assert_eq!(obs.tool, "Write");
        assert_eq!(obs.file_path.as_deref(), Some("src/new_mod.rs"));
        assert!(
            obs.title.contains("Write"),
            "title missing tool: {}",
            obs.title
        );
    }

    #[test]
    fn classify_edit_removing_fixme_returns_bugfix() {
        // An Edit that drops a visible FIXME comment from the old
        // code must be tagged bugfix — this is the strongest local
        // signal we have that the user just shipped a real fix.
        let old = "// FIXME: panics on None\nfoo.unwrap();\n";
        let new = "if let Some(x) = foo { use_x(x); }\n";
        let diff =
            "-// FIXME: panics on None\n-foo.unwrap();\n+if let Some(x) = foo { use_x(x); }\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "bugfix");
    }

    #[test]
    fn classify_edit_whitespace_only_returns_refactor() {
        // rustfmt-style tweak: same tokens, different whitespace.
        // The classifier must tag these as refactor so the cloud
        // rule-promoter doesn't treat them as semantic change
        // candidates.
        let old = "let x=1;let y=2;";
        let new = "let x = 1;\nlet y = 2;";
        let diff = "-let x=1;let y=2;\n+let x = 1;\n+let y = 2;\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "refactor");
    }

    #[test]
    fn removing_debug_line_does_not_count_as_bug_marker_removal() {
        // Regression: substring matching let "DEBUG" trigger the "BUG"
        // marker — removing a `// DEBUG: …` line silently mislabelled
        // a refactor as a bugfix. Word-boundary counting keeps
        // BUG/DEBUG distinct.
        let old = "// DEBUG: tracing\nlog::trace!(\"x={x}\");\n";
        let new = "// (debug line removed)\n";
        let diff = "-// DEBUG: tracing\n-log::trace!(\"x={x}\");\n+// (debug line removed)\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_ne!(
            obs.obs_type, "bugfix",
            "DEBUG → empty must not be classified as a bugfix"
        );
    }

    #[test]
    fn classify_edit_default_returns_change() {
        // A plain-Jane semantic edit with no bug-marker removal and
        // non-trivial content change must fall through to the safe
        // `change` default.
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "change");
    }

    #[test]
    fn privacy_guard_blocks_env_files() {
        // `.env.local` is the single most-common secret-bearing file
        // an agent will touch. The guard must fire before classify
        // returns so the observation never gets enqueued.
        let inp = input(
            "Write",
            "src/app/.env.local",
            Some("+SECRET=abc\n"),
            Some("SECRET=abc\n"),
            None,
        );
        assert!(classify(&inp).is_none());
    }

    #[test]
    fn privacy_guard_allows_normal_source_files() {
        // Sanity check: ordinary `.rs` paths must NOT be denied.
        let inp = input(
            "Write",
            "src/foo.rs",
            Some("+fn main() {}\n"),
            Some("fn main() {}\n"),
            None,
        );
        assert!(classify(&inp).is_some());
    }

    #[test]
    fn privacy_guard_covers_pem_key_credentials() {
        // Confirm every pattern in the hardcoded list triggers.
        for path in &[
            "config/.env",
            "app.secrets.json",
            "infra/prod.secrets.yaml",
            "keys/server.key",
            "certs/app.pem",
            "home/user/.ssh/id_rsa",
            "credentials.json",
        ] {
            assert!(is_privacy_denied(path), "expected deny for `{path}`");
        }
    }

    #[test]
    fn private_tagged_regions_are_redacted_from_observation_payload() {
        // Text wrapped in `<private>…</private>` must not reach either
        // diff-derived payload field: both the narrative and the diff
        // excerpt must carry the redaction placeholder instead of the
        // tagged secret.
        let diff = "-safe\n+safe <private>token=abc</private>\n+done\n";
        let inp = input(
            "Edit",
            "src/foo.rs",
            Some(diff),
            Some("safe done\n"),
            Some("safe\n"),
        );

        let obs = classify(&inp).expect("some");

        assert!(
            obs.narrative
                .as_deref()
                .unwrap()
                .contains("[redacted private content]")
        );
        assert!(
            obs.diff_excerpt
                .as_deref()
                .unwrap()
                .contains("[redacted private content]")
        );
        assert!(!obs.narrative.as_deref().unwrap().contains("token=abc"));
        assert!(!obs.diff_excerpt.as_deref().unwrap().contains("token=abc"));
    }

    #[test]
    fn content_hash_is_stable_and_file_sensitive() {
        // Two observations from identical inputs produce the same hash
        // (idempotent cloud insertion), but a file-path change breaks it.
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let a = classify(&inp).expect("some");
        let b = classify(&inp).expect("some");
        assert_eq!(a.content_hash, b.content_hash);
        assert_eq!(a.content_hash.len(), 16);

        let other = classify(&input("Edit", "b.rs", Some(diff), Some(new), Some(old))).unwrap();
        assert_ne!(a.content_hash, other.content_hash);
    }

    #[test]
    fn non_edit_tool_returns_none() {
        // Read / Bash / anything-else-we-don't-recognise must
        // short-circuit. The hook is responsible for filtering
        // too, but defence in depth keeps noise out of the
        // observation stream.
        let inp = input("Read", "src/foo.rs", None, None, None);
        assert!(classify(&inp).is_none());
    }

    #[test]
    fn missing_diff_and_new_text_returns_none() {
        // If the adapter couldn't reconstruct a diff AND didn't
        // give us new_text, there's nothing to classify. We MUST
        // return None rather than emit an observation with empty
        // content.
        let inp = input("Edit", "src/foo.rs", None, None, Some("old"));
        assert!(classify(&inp).is_none());
    }

    #[test]
    fn classify_emits_structured_scope_metadata() {
        // The scope must describe the edited file: the full path as
        // anchor key, its directory as parent path and the file name
        // as display name.
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let obs = classify(&input(
            "Edit",
            "src/auth/login/handler.rs",
            Some(diff),
            Some(new),
            Some(old),
        ))
        .expect("some");

        assert_eq!(
            obs.scope,
            Some(ObservationScope {
                anchor_kind: "file".to_owned(),
                anchor_key: "src/auth/login/handler.rs".to_owned(),
                parent_path: Some("src/auth/login".to_owned()),
                display_name: Some("handler.rs".to_owned()),
            })
        );
    }

    #[test]
    fn wire_shape_accepts_optional_scope_metadata() {
        // A JSON payload that includes the `scope` object must
        // deserialize into `Observation` with the scope preserved.
        let payload = serde_json::json!({
            "session_id": "sess_new",
            "ts_ms": 2,
            "obs_type": "bugfix",
            "tool": "Edit",
            "file_path": "src/auth/login/handler.rs",
            "scope": {
                "anchor_kind": "file",
                "anchor_key": "src/auth/login/handler.rs",
                "parent_path": "src/auth/login",
                "display_name": "handler.rs"
            },
            "title": "Edit src/auth/login/handler.rs: remove bug marker",
            "narrative": "guard login retry state",
            "diff_excerpt": "-old\n+new",
            "content_hash": "def456"
        });

        let obs: Observation = serde_json::from_value(payload).expect("deserialize");
        assert_eq!(
            obs.scope.as_ref().map(|scope| scope.anchor_key.as_str()),
            Some("src/auth/login/handler.rs")
        );
    }

    /// Ignored-by-default helper that prints a wire-format example for
    /// each `obs_type`. Run with `cargo test -p difflore-core --lib
    /// observation::tests::print_wire_samples -- --ignored --nocapture`.
    /// Emits stable payload shapes without re-deriving them.
    #[test]
    #[ignore = "doc helper for sample wire output, run manually"]
    fn print_wire_samples() {
        // One representative input per label the local classifier can
        // emit; the label is only used as the printed section header.
        let samples = [
            (
                "feature",
                input(
                    "Write",
                    "src/new_mod.rs",
                    Some("+fn hello() {}\n+pub fn world() {}\n"),
                    Some("fn hello() {}\npub fn world() {}\n"),
                    None,
                ),
            ),
            (
                "bugfix",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some(
                        "-// FIXME: crash on None\n-foo.unwrap();\n+if let Some(x) = foo { use_x(x); }\n",
                    ),
                    Some("if let Some(x) = foo { use_x(x); }\n"),
                    Some("// FIXME: crash on None\nfoo.unwrap();\n"),
                ),
            ),
            (
                "refactor",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some("-let x=1;let y=2;\n+let x = 1;\n+let y = 2;\n"),
                    Some("let x = 1;\nlet y = 2;"),
                    Some("let x=1;let y=2;"),
                ),
            ),
            (
                "change",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some("-let x = 1;\n+let x = compute_answer();\n"),
                    Some("let x = compute_answer();"),
                    Some("let x = 1;"),
                ),
            ),
        ];
        for (label, inp) in samples {
            let obs = classify(&inp).expect("some");
            let json = serde_json::to_string_pretty(&obs).unwrap();
            println!("=== {label} ===\n{json}\n");
        }
    }

    #[test]
    fn diff_excerpt_truncates_large_diffs() {
        // Build a synthetic diff made of two ~4 KB lines (about 8 KB
        // in total) and confirm the excerpt caps at roughly
        // DIFF_EXCERPT_MAX_BYTES (plus the truncation marker).
        let big: String = (0..4096).map(|_| 'x').collect();
        let diff = format!("-{big}\n+{big}Y\n");
        let inp = input("Edit", "src/foo.rs", Some(&diff), Some("yYY"), Some("xxx"));
        let obs = classify(&inp).expect("some");
        let excerpt = obs.diff_excerpt.expect("excerpt present");
        assert!(
            excerpt.len() <= DIFF_EXCERPT_MAX_BYTES + 32,
            "excerpt too long: {}",
            excerpt.len()
        );
        assert!(excerpt.ends_with("[truncated]"));
    }
}