// spool-memory 0.2.3

//! Extraction heuristic — conservative behavior-pattern miner.
//!
//! ## Philosophy
//! `candidate` records show up in `memory_review_queue` and the user
//! has to triage them. False positives cost user attention and erode
//! trust in spool ("more noise than signal"). We deliberately err on
//! the side of **under-detection**: only emit a candidate when we
//! have reasonable confidence the pattern is real.
//!
//! ## R3c scope: incident detection
//! We detect one pattern: **repeated frustration markers** within a
//! single session's user-authored transcript entries.
//!
//! Triggering condition:
//! 1. A user message contains a "frustration phrase" (Chinese, e.g.
//!    "错了" / "不对" / "失败了"; English, e.g. "wrong" / "not working" /
//!    "still broken" / "doesn't work" / "didn't work" / "broke
//!    again" — see [`FRUSTRATION_PHRASES`] for the full list).
//! 2. AT LEAST ONE OTHER user message in the same session also
//!    contains a frustration phrase.
//! 3. The two messages are within a sliding window of
//!    [`INCIDENT_WINDOW`] consecutive user messages.
//!
//! When triggered, we emit a single [`ExtractionSignal`] of kind
//! [`ExtractionKind::Incident`] whose summary is the **last** matched
//! user message (trimmed, and cut to [`MAX_SUMMARY_CHARS`] chars with
//! a trailing `…` when it is longer). Multiple incidents per
//! session are fine, but each emits a single signal — we don't spam
//! the review queue with duplicate frustration entries.
//!
//! ## What we deliberately don't do (yet)
//! - Behavioral pattern mining over the distill queue (tool-use
//!   counts). Too noisy without ranking.
//! - Commit-style change detection. Requires git introspection
//!   coupling that's out of scope for R3.
//! - Cross-session aggregation. R4 sampling will handle that.

use serde::{Deserialize, Serialize};

/// How many user messages back we look when verifying repeated
/// frustration. Smaller window = more conservative.
///
/// In `detect` the window is anchored at a cluster's first hit: a
/// later hit joins the cluster only if its index is strictly less than
/// `first_hit + INCIDENT_WINDOW`.
pub const INCIDENT_WINDOW: usize = 6;

/// Cap on the summary content (mirrors [`super::self_tag::MAX_CONTENT_CHARS`]).
///
/// Counted in `char`s, not bytes. A message longer than this is cut to
/// this many chars and then gets a trailing `…`, so a truncated
/// summary is `MAX_SUMMARY_CHARS + 1` chars long.
pub const MAX_SUMMARY_CHARS: usize = 1000;

/// One extraction-derived candidate signal.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ExtractionSignal {
    /// Which category of pattern this signal represents.
    pub kind: ExtractionKind,
    /// Human-readable text for the candidate. For incidents produced
    /// by `detect`, this is the last matching user message in the
    /// cluster, trimmed and length-capped.
    pub summary: String,
    /// Indices into the input slice where the supporting evidence
    /// lives. Used by tests + future audit logs.
    pub evidence_indices: Vec<usize>,
}

/// Category of an [`ExtractionSignal`].
///
/// Serialized with serde using snake_case variant names
/// (e.g. `BehaviorPattern` becomes `"behavior_pattern"`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ExtractionKind {
    /// Repeated frustration / error markers within the session.
    Incident,
    /// Recurring behavior the user wants to standardize. Currently
    /// only emitted by sampling-derived candidates (R4b-2); Tier 1
    /// heuristics never produce this kind directly.
    BehaviorPattern,
    /// User-stated decision. Currently only emitted by
    /// sampling-derived candidates; Tier 1 self-tag handles explicit
    /// decisions like "决定 X" / "以后都 Y".
    Decision,
}

impl ExtractionKind {
    pub fn memory_type(self) -> &'static str {
        match self {
            ExtractionKind::Incident => "incident",
            ExtractionKind::BehaviorPattern => "behavior_pattern",
            ExtractionKind::Decision => "decision",
        }
    }
}

/// Frustration phrases we treat as evidence of an incident. Order
/// doesn't matter; matches are case-insensitive for English.
///
/// Matching is plain substring search (no word boundaries), so an
/// entry such as "wrong" also matches inside a longer word.
///
/// ASCII entries MUST be written in lowercase: `contains_frustration`
/// lowercases the message but compares against the phrase as written
/// here. Non-ASCII entries are matched against the unmodified message.
const FRUSTRATION_PHRASES: &[&str] = &[
    // Chinese (exact substring match)
    "错了",
    "不对",
    "不行",
    "失败了",
    "又失败",
    "不工作",
    "崩了",
    "出错了",
    // English (case-insensitive search; keep these lowercase)
    "wrong",
    "not working",
    "doesn't work",
    "didn't work",
    "still broken",
    "broke again",
    "still not",
    "not fixed",
];

/// Scan a session's user messages (oldest first) for incident clusters.
///
/// A cluster is anchored at a frustration hit and absorbs every later
/// hit whose index is below `anchor + INCIDENT_WINDOW`. Clusters with
/// fewer than two hits are discarded. Every surviving cluster yields
/// exactly one [`ExtractionSignal`]: its summary is the trimmed,
/// length-capped text of the cluster's final hit, and its
/// `evidence_indices` list every hit in the cluster.
///
/// Returns an empty vector when fewer than two messages are given or
/// fewer than two of them contain a frustration phrase.
pub fn detect(user_messages: &[&str]) -> Vec<ExtractionSignal> {
    let mut signals = Vec::new();
    // A lone message can never satisfy the "repeated" requirement.
    if user_messages.len() < 2 {
        return signals;
    }

    // Positions (ascending) of every message carrying a frustration phrase.
    let mut hits: Vec<usize> = Vec::new();
    for (idx, msg) in user_messages.iter().enumerate() {
        if contains_frustration(msg) {
            hits.push(idx);
        }
    }
    if hits.len() < 2 {
        return signals;
    }

    // Greedy left-to-right clustering. `cursor` always points at the
    // hit that anchors the next candidate cluster.
    let mut cursor = 0;
    while cursor < hits.len() {
        let remaining = &hits[cursor..];
        let window_end = remaining[0] + INCIDENT_WINDOW;
        let cluster_len = remaining
            .iter()
            .take_while(|&&hit| hit < window_end)
            .count();

        // An isolated hit is not an incident; retry from the next hit.
        if cluster_len < 2 {
            cursor += 1;
            continue;
        }

        let cluster = &remaining[..cluster_len];
        let last = cluster[cluster_len - 1];
        let summary = cap_chars(user_messages[last].trim(), MAX_SUMMARY_CHARS);
        if !summary.is_empty() {
            signals.push(ExtractionSignal {
                kind: ExtractionKind::Incident,
                summary,
                evidence_indices: cluster.to_vec(),
            });
        }
        // Skip past every hit consumed by this cluster so one incident
        // produces one review item.
        cursor += cluster_len;
    }
    signals
}

/// Report whether `msg` contains any entry of [`FRUSTRATION_PHRASES`]
/// as a substring.
///
/// ASCII phrases are searched in a lowercased copy of the message,
/// which makes them case-insensitive. Non-ASCII phrases are searched
/// in the message exactly as given. An empty message never matches.
fn contains_frustration(msg: &str) -> bool {
    if msg.is_empty() {
        return false;
    }
    // Lowercase once up front; every ASCII phrase reuses this copy.
    let folded = msg.to_lowercase();
    FRUSTRATION_PHRASES.iter().any(|&phrase| {
        // Pick the haystack per phrase: case-folded text for ASCII
        // phrases, the untouched message for everything else.
        let haystack = if phrase.is_ascii() {
            folded.as_str()
        } else {
            msg
        };
        haystack.contains(phrase)
    })
}

/// Limit `s` to `max_chars` characters (Unicode scalar values).
///
/// When `s` has at most `max_chars` characters it is returned
/// unchanged as an owned `String`. Otherwise the first `max_chars`
/// characters are kept and a single `…` is appended, so a truncated
/// result is `max_chars + 1` characters long.
fn cap_chars(s: &str, max_chars: usize) -> String {
    // The character at position `max_chars` (if any) is the first one
    // to drop; its byte offset is where the kept prefix ends.
    match s.char_indices().nth(max_chars) {
        None => s.to_string(),
        Some((cut, _)) => {
            let mut capped = String::with_capacity(cut + '…'.len_utf8());
            capped.push_str(&s[..cut]);
            capped.push('…');
            capped
        }
    }
}

#[cfg(test)]
mod tests {
    // Unit tests for `detect` and `ExtractionKind::memory_type`.
    // Each test builds a small chronological list of user messages and
    // checks how many incident signals come back and what they contain.
    use super::*;

    #[test]
    fn detect_returns_empty_for_short_session() {
        // Zero or one message can never contain a repeated marker.
        assert!(detect(&[]).is_empty());
        assert!(detect(&["just one message"]).is_empty());
    }

    #[test]
    fn detect_returns_empty_when_no_frustration() {
        // No message contains any frustration phrase.
        let msgs = ["normal", "everything fine", "all good"];
        let refs: Vec<&str> = msgs.to_vec();
        assert!(detect(&refs).is_empty());
    }

    #[test]
    fn detect_returns_empty_for_single_frustration_hit() {
        // One miss + one hit = not enough to confirm an incident.
        let msgs = ["normal", "this still not working"];
        let refs: Vec<&str> = msgs.to_vec();
        assert!(detect(&refs).is_empty());
    }

    #[test]
    fn detect_emits_signal_for_two_chinese_frustration_hits() {
        // Hits at indices 1 and 3; the summary is the later hit.
        let msgs = ["试一下", "还是错了", "看看日志", "又失败了"];
        let refs: Vec<&str> = msgs.to_vec();
        let signals = detect(&refs);
        assert_eq!(signals.len(), 1);
        assert_eq!(signals[0].kind, ExtractionKind::Incident);
        assert_eq!(signals[0].summary, "又失败了");
        assert_eq!(signals[0].evidence_indices, vec![1, 3]);
    }

    #[test]
    fn detect_emits_signal_for_two_english_frustration_hits() {
        // Same shape as the Chinese case, using English phrases.
        let msgs = [
            "let me try this",
            "ugh, that's wrong",
            "let me check logs",
            "still not working",
        ];
        let refs: Vec<&str> = msgs.to_vec();
        let signals = detect(&refs);
        assert_eq!(signals.len(), 1);
        assert_eq!(signals[0].summary, "still not working");
    }

    #[test]
    fn detect_skips_when_hits_outside_sliding_window() {
        // Hits 7 messages apart with INCIDENT_WINDOW=6 → no signal.
        // Layout: hit at index 0, INCIDENT_WINDOW fillers, hit at the end.
        let mut msgs = vec!["wrong"];
        msgs.extend(std::iter::repeat_n("filler", INCIDENT_WINDOW));
        msgs.push("still not");
        let refs: Vec<&str> = msgs.to_vec();
        let signals = detect(&refs);
        assert!(
            signals.is_empty(),
            "hits across {} msgs should not emit",
            INCIDENT_WINDOW + 2
        );
    }

    #[test]
    fn detect_collapses_one_incident_per_overlapping_window() {
        // Three frustration hits in a row → one signal (we don't
        // spam the queue).
        let msgs = ["wrong", "still wrong", "broke again"];
        let refs: Vec<&str> = msgs.to_vec();
        let signals = detect(&refs);
        assert_eq!(signals.len(), 1);
        assert_eq!(signals[0].evidence_indices.len(), 3);
        // Summary is the LAST hit.
        assert_eq!(signals[0].summary, "broke again");
    }

    #[test]
    fn detect_caps_summary_when_user_message_is_huge() {
        // The second (last) hit is far longer than the cap.
        let mut huge = String::from("wrong: ");
        huge.push_str(&"x".repeat(MAX_SUMMARY_CHARS * 2));
        let msgs = ["first wrong attempt", huge.as_str()];
        let refs: Vec<&str> = msgs.to_vec();
        let signals = detect(&refs);
        assert_eq!(signals.len(), 1);
        let chars = signals[0].summary.chars().count();
        // +1 leaves room for the trailing ellipsis added on truncation.
        assert!(chars <= MAX_SUMMARY_CHARS + 1);
        assert!(signals[0].summary.ends_with('…'));
    }

    #[test]
    fn detect_case_insensitive_for_english_phrases() {
        // Upper- and mixed-case English still counts as a hit.
        let msgs = ["WRONG.", "Still NOT working"];
        let refs: Vec<&str> = msgs.to_vec();
        let signals = detect(&refs);
        assert_eq!(signals.len(), 1);
    }

    #[test]
    fn detect_handles_mixed_chinese_english_hits() {
        // A Chinese hit and an English hit form one cluster together.
        let msgs = ["错了", "filler", "wrong"];
        let refs: Vec<&str> = msgs.to_vec();
        let signals = detect(&refs);
        assert_eq!(signals.len(), 1);
    }

    #[test]
    fn extraction_kind_memory_type_is_incident() {
        // Only the `Incident` label is pinned here.
        assert_eq!(ExtractionKind::Incident.memory_type(), "incident");
    }
}