Skip to main content

nexus_memory_hooks/
candidate.rs

1//! High-signal memory candidate derivation from normalized hook events
2//!
3//! Scores events and derives MemoryCandidates only when there is
4//! possible retrieval value. Implements duplicate suppression.
5
6use serde::{Deserialize, Serialize};
7use serde_json::Value;
8use sha2::{Digest, Sha256};
9use std::collections::HashSet;
10
11use crate::claude_payload::NormalizedHookEvent;
12
/// Shorten `s` to no more than `max_chars` characters.
///
/// When the input has to be cut, `"..."` is appended after the kept
/// characters, so a shortened result is `max_chars + 3` characters long.
/// Input that already fits is returned unchanged (as an owned copy).
///
/// The cut position is found by counting `char`s, not bytes, so it never
/// lands in the middle of a multi-byte UTF-8 sequence.
pub(crate) fn truncate_str(s: &str, max_chars: usize) -> String {
    // `nth(max_chars)` yields the first character *past* the limit together
    // with its byte offset; if there is no such character, `s` already fits.
    match s.char_indices().nth(max_chars) {
        Some((cut, _)) => format!("{}...", &s[..cut]),
        None => s.to_string(),
    }
}
24
/// A candidate memory derived from a hook event, pending LLM enrichment.
///
/// Instances are produced by `derive_candidates`, which fills every field
/// from the originating `NormalizedHookEvent`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryCandidate {
    /// Unique identifier; `derive_candidates` assigns a freshly generated
    /// UUID v4 rendered as a string.
    pub candidate_id: String,
    /// Name of the hook event this candidate was derived from.
    pub source_event_name: String,
    /// Agent recorded on the originating event.
    pub source_agent: String,
    /// Heuristic score accumulated by `derive_candidates`; candidates are
    /// only emitted when it is at least 0.3.
    pub signal_score: f32,
    /// Best-effort category guess (`"preferences"`, `"context"` or
    /// `"facts"`), or `None` when no heuristic applied.
    pub provisional_category: Option<String>,
    /// Human-readable text of the memory, built by `derive_memory_text`.
    pub memory_text: String,
    /// JSON object with the tool name, tool input and truncated text
    /// excerpts backing this candidate, built by `build_evidence`.
    pub evidence: Value,
    /// Tags such as `"high-signal"`, `"tool:<name>"`, `"plan"` or
    /// `"verification"`, built by `derive_labels`.
    pub labels: Vec<String>,
}
37
/// Simple bash commands that indicate low-signal noise.
///
/// `derive_candidates` gives a Bash event whose command is one of these only
/// a token score (0.1) and skips the usual tool-input bonuses for it.
const SIMPLE_BASH_COMMANDS: &[&str] = &["ls", "pwd", "whoami", "date", "uptime", "echo"];
40
/// Patterns that indicate high-signal content in tool responses.
///
/// `derive_candidates` looks for each entry as a substring of the
/// lower-cased Bash response, so every entry must itself be lowercase.
/// Each matching entry adds 0.1 to the score, capped at 0.4 in total.
///
/// NOTE(review): `"/"` and `"."` match almost any path or sentence, so most
/// non-trivial responses collect at least those two matches — confirm that
/// this breadth is intended.
const HIGH_SIGNAL_PATTERNS: &[&str] = &[
    "test result:",
    "passed",
    "failed",
    "error:",
    "warning:",
    "version",
    "/",
    ".",
    "compilation",
    "build",
];
54
55/// Derive memory candidates from a normalized hook event.
56///
57/// Returns a vector of candidates (typically 0-1). Returns empty if:
58/// - The event has no extractable content
59/// - The signal score is below threshold (0.3)
60/// - The event is a duplicate (based on fingerprint)
61pub fn derive_candidates(
62    event: &NormalizedHookEvent,
63    seen_fingerprints: &mut HashSet<String>,
64) -> Vec<MemoryCandidate> {
65    // Start with zero signal
66    let mut signal_score = 0.0f32;
67
68    // Bash-specific scoring - check before base score for simple commands
69    let is_low_signal_bash = if event.tool_name.as_deref() == Some("Bash") {
70        let command = event
71            .tool_input
72            .as_ref()
73            .and_then(|v| v.get("command"))
74            .and_then(|c| c.as_str())
75            .unwrap_or("");
76
77        let is_simple = SIMPLE_BASH_COMMANDS
78            .iter()
79            .any(|&simple| command.trim().starts_with(simple));
80
81        if is_simple {
82            // Simple bash commands get very low signal
83            signal_score += 0.1;
84            true
85        } else if let Some(response) = &event.tool_response_text {
86            // Check for high-signal patterns in response
87            let pattern_matches = HIGH_SIGNAL_PATTERNS
88                .iter()
89                .filter(|&&pattern| response.to_lowercase().contains(pattern))
90                .count();
91            // Score proportional to number of high-signal patterns found
92            signal_score += 0.2 + (pattern_matches as f32 * 0.1).min(0.4);
93            false
94        } else {
95            false
96        }
97    } else {
98        false
99    };
100
101    // Check for tool input with content (unless it's a low-signal bash command)
102    if !is_low_signal_bash {
103        let has_tool_input = event
104            .tool_input
105            .as_ref()
106            .map(|v| !v.is_null() && !v.as_object().map(|o| o.is_empty()).unwrap_or(false))
107            .unwrap_or(false);
108
109        if event.tool_name.is_some() && has_tool_input {
110            signal_score += 0.3;
111        }
112
113        // File operations (Read, Write, Edit) are inherently high-signal
114        if let Some(tool) = &event.tool_name {
115            let tool_lower = tool.to_lowercase();
116            if tool_lower == "read"
117                || tool_lower == "write"
118                || tool_lower == "edit"
119                || tool_lower == "multi_edit"
120                || tool_lower == "glob"
121                || tool_lower == "grep"
122            {
123                signal_score += 0.2;
124            }
125        }
126    }
127
128    // Assistant message contributes (reduced minimum threshold, reward length)
129    if let Some(msg) = &event.assistant_message_text {
130        if msg.len() > 50 {
131            signal_score += 0.3;
132        } else if msg.len() > 20 {
133            signal_score += 0.15;
134        }
135    }
136
137    // User message contributes (reduced minimum threshold, reward length)
138    if let Some(msg) = &event.user_message_text {
139        if msg.len() > 100 {
140            signal_score += 0.35;
141        } else if msg.len() > 20 {
142            signal_score += 0.2;
143        }
144    }
145
146    // User prompt submit is high signal only if it has actual content
147    if event.event_name == "user-prompt-submit" && event.user_message_text.is_some() {
148        signal_score += 0.3;
149    }
150
151    // Plan and review events are high signal
152    let event_lower = event.event_name.to_lowercase();
153    if event_lower.contains("plan") || event_lower.contains("review") {
154        signal_score += 0.2;
155    }
156
157    // Error/failure events are always high signal
158    if event_lower.contains("error")
159        || event_lower.contains("fail")
160        || event_lower.contains("crash")
161    {
162        signal_score += 0.25;
163    }
164
165    // Build/compile/test events are high signal
166    if event_lower.contains("build")
167        || event_lower.contains("test")
168        || event_lower.contains("compile")
169    {
170        signal_score += 0.2;
171    }
172
173    // Require meaningful content, not just metadata presence
174    let has_meaningful_tool_input = event.tool_input.as_ref().is_some_and(|v| {
175        !v.is_null() && !v.as_object().is_some_and(|o| o.is_empty()) && v.to_string().len() > 5
176    });
177    let has_any_content = has_meaningful_tool_input
178        || event
179            .tool_response_text
180            .as_ref()
181            .is_some_and(|s| s.len() > 10)
182        || event
183            .assistant_message_text
184            .as_ref()
185            .is_some_and(|s| s.len() > 20)
186        || event
187            .user_message_text
188            .as_ref()
189            .is_some_and(|s| s.len() > 10);
190
191    if !has_any_content {
192        return Vec::new();
193    }
194
195    // Skip if signal score is too low (reduced from 0.4 to 0.3 to capture more useful events)
196    if signal_score < 0.3 {
197        return Vec::new();
198    }
199
200    // Build fingerprint for duplicate suppression
201    let tool_input_hash = if let Some(input) = &event.tool_input {
202        let mut hasher = Sha256::new();
203        hasher.update(input.to_string().as_bytes());
204        format!("{:x}", hasher.finalize())
205    } else {
206        String::new()
207    };
208
209    let fingerprint = format!(
210        "{}|{}|{}|{}",
211        event.session_id.as_deref().unwrap_or(""),
212        event.event_name,
213        event.tool_name.as_deref().unwrap_or(""),
214        tool_input_hash
215    );
216
217    if seen_fingerprints.contains(&fingerprint) {
218        return Vec::new(); // Duplicate
219    }
220
221    seen_fingerprints.insert(fingerprint);
222
223    // Derive memory text based on event type
224    let memory_text = derive_memory_text(event);
225
226    // Build evidence JSON
227    let evidence = build_evidence(event);
228
229    // Derive labels
230    let labels = derive_labels(event, signal_score);
231
232    // Determine provisional category
233    let provisional_category = derive_provisional_category(event, signal_score);
234
235    let candidate_id = uuid::Uuid::new_v4().to_string();
236
237    vec![MemoryCandidate {
238        candidate_id,
239        source_event_name: event.event_name.clone(),
240        source_agent: event.agent.clone(),
241        signal_score,
242        provisional_category,
243        memory_text,
244        evidence,
245        labels,
246    }]
247}
248
249/// Derive memory text based on event type and content.
250fn derive_memory_text(event: &NormalizedHookEvent) -> String {
251    // Bash tool event
252    if event.tool_name.as_deref() == Some("Bash") {
253        let command = event
254            .tool_input
255            .as_ref()
256            .and_then(|v| v.get("command"))
257            .and_then(|c| c.as_str())
258            .unwrap_or("");
259
260        let excerpt = event
261            .tool_response_text
262            .as_ref()
263            .map(|s| truncate_str(s, 12000))
264            .unwrap_or_default();
265
266        if !excerpt.is_empty() {
267            return format!("Ran `{}` → {}", command, excerpt);
268        }
269    }
270
271    // User prompt submit
272    if event.event_name == "user-prompt-submit" {
273        if let Some(msg) = &event.user_message_text {
274            return msg.clone();
275        }
276    }
277
278    // Plan/review events
279    let event_lower = event.event_name.to_lowercase();
280    if event_lower.contains("plan") || event_lower.contains("review") {
281        if let Some(input) = &event.tool_input {
282            if let Some(plan) = input.get("plan").and_then(|p| p.as_str()) {
283                return format!("Plan: {}", plan);
284            }
285        }
286        if let Some(name) = &event.tool_name {
287            return format!("Plan: {}", name);
288        }
289    }
290
291    // Assistant messages with decisions
292    if let Some(msg) = &event.assistant_message_text {
293        if msg.to_lowercase().contains("decision")
294            || msg.to_lowercase().contains("will")
295            || msg.to_lowercase().contains("going to")
296        {
297            let excerpt = truncate_str(msg, 12000);
298            return format!("Decision: {}", excerpt);
299        }
300    }
301
302    // Fallback: concatenate available text
303    let parts: Vec<&str> = vec![
304        event.tool_response_text.as_deref(),
305        event.assistant_message_text.as_deref(),
306        event.user_message_text.as_deref(),
307    ]
308    .into_iter()
309    .flatten()
310    .collect();
311
312    if parts.is_empty() {
313        format!("Event: {}", event.event_name)
314    } else {
315        parts.join(" | ")
316    }
317}
318
319/// Build evidence JSON for the candidate.
320fn build_evidence(event: &NormalizedHookEvent) -> Value {
321    let mut evidence = serde_json::Map::new();
322
323    if let Some(name) = &event.tool_name {
324        evidence.insert("tool_name".to_string(), Value::String(name.clone()));
325    }
326
327    if let Some(input) = &event.tool_input {
328        evidence.insert("tool_input".to_string(), input.clone());
329    }
330
331    if let Some(response) = &event.tool_response_text {
332        let excerpt = truncate_str(response, 200);
333        evidence.insert("tool_response_excerpt".to_string(), Value::String(excerpt));
334    }
335
336    if let Some(msg) = &event.assistant_message_text {
337        let excerpt = truncate_str(msg, 200);
338        evidence.insert(
339            "assistant_message_excerpt".to_string(),
340            Value::String(excerpt),
341        );
342    }
343
344    if let Some(msg) = &event.user_message_text {
345        let excerpt = truncate_str(msg, 200);
346        evidence.insert("user_message_excerpt".to_string(), Value::String(excerpt));
347    }
348
349    Value::Object(evidence)
350}
351
352/// Derive labels based on event content.
353fn derive_labels(event: &NormalizedHookEvent, signal_score: f32) -> Vec<String> {
354    let mut labels = Vec::new();
355
356    // Signal level label
357    if signal_score >= 0.7 {
358        labels.push("high-signal".to_string());
359    } else if signal_score >= 0.5 {
360        labels.push("medium-signal".to_string());
361    }
362
363    // Tool-based labels
364    if let Some(name) = &event.tool_name {
365        labels.push(format!("tool:{}", name.to_lowercase()));
366    }
367
368    // Event type labels
369    let event_lower = event.event_name.to_lowercase();
370    if event_lower.contains("plan") {
371        labels.push("plan".to_string());
372    }
373    if event_lower.contains("review") {
374        labels.push("review".to_string());
375    }
376    if event_lower.contains("error") {
377        labels.push("error".to_string());
378    }
379
380    // Verification/testing labels
381    if event_lower.contains("test") || event_lower.contains("verify") {
382        labels.push("verification".to_string());
383    }
384
385    labels
386}
387
388/// Derive provisional category based on event characteristics.
389fn derive_provisional_category(event: &NormalizedHookEvent, signal_score: f32) -> Option<String> {
390    let event_lower = event.event_name.to_lowercase();
391
392    // User preferences
393    if event_lower.contains("user-prompt") {
394        if let Some(msg) = &event.user_message_text {
395            if msg.to_lowercase().contains("prefer")
396                || msg.to_lowercase().contains("always")
397                || msg.to_lowercase().contains("never")
398            {
399                return Some("preferences".to_string());
400            }
401        }
402    }
403
404    // Context/decisions from assistant
405    if event_lower.contains("plan") || event_lower.contains("review") {
406        return Some("context".to_string());
407    }
408
409    // Facts from verification
410    if (event_lower.contains("test") || event_lower.contains("verify")) && signal_score > 0.6 {
411        return Some("facts".to_string());
412    }
413
414    // Bash output often contains facts
415    if event.tool_name.as_deref() == Some("Bash") {
416        if let Some(response) = &event.tool_response_text {
417            if response.contains("test result:")
418                || response.contains("passed")
419                || response.contains("failed")
420            {
421                return Some("facts".to_string());
422            }
423        }
424    }
425
426    None
427}
428
/// Unit tests for candidate derivation and for `truncate_str`.
///
/// Events are built through `normalize_claude_payload` so the tests exercise
/// the same path as real hook payloads.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::claude_payload::normalize_claude_payload;
    use serde_json::json;

    #[test]
    fn test_noise_event_yields_no_candidates() {
        let raw = json!({
            "tool_name": "Bash",
            "tool_input": {"command": "ls"},
            "tool_response": "file1.txt\nfile2.txt"
        });

        let event = normalize_claude_payload("claude-code", "post-tool-use", &raw);
        let mut seen = HashSet::new();
        let candidates = derive_candidates(&event, &mut seen);

        // A plain `ls` scores below the 0.3 threshold and is dropped.
        assert!(candidates.is_empty());
    }

    #[test]
    fn test_bash_verification_event_yields_candidate() {
        let raw = json!({
            "tool_name": "Bash",
            "tool_input": {"command": "cargo test"},
            "tool_response": "running 12 tests\ntest result: ok. 12 passed; 0 failed",
            "session_id": "sess-123"
        });

        let event = normalize_claude_payload("claude-code", "post-tool-use", &raw);
        let mut seen = HashSet::new();
        let candidates = derive_candidates(&event, &mut seen);

        assert_eq!(candidates.len(), 1);
        let candidate = &candidates[0];
        assert!(candidate.signal_score >= 0.4);
        assert!(candidate.memory_text.contains("Ran"));
        assert!(candidate.labels.iter().any(|l| l == "tool:bash"));
    }

    #[test]
    fn test_user_preference_prompt_yields_candidate() {
        let raw = json!({
            "event_name": "user-prompt-submit",
            "user_message": "I always prefer to use rustfmt with a 4-space indent. Please configure this for all my projects."
        });

        let event = normalize_claude_payload("claude-code", "user-prompt-submit", &raw);
        let mut seen = HashSet::new();
        let candidates = derive_candidates(&event, &mut seen);

        assert_eq!(candidates.len(), 1);
        let candidate = &candidates[0];
        assert!(candidate.signal_score >= 0.5);
        assert_eq!(
            candidate.provisional_category,
            Some("preferences".to_string())
        );
        assert!(candidate.memory_text.contains("prefer"));
    }

    #[test]
    fn test_duplicate_suppression_works() {
        let raw = json!({
            "tool_name": "Bash",
            "tool_input": {"command": "cargo test"},
            "tool_response": "test result: ok",
            "session_id": "sess-456"
        });

        let event = normalize_claude_payload("claude-code", "post-tool-use", &raw);
        let mut seen = HashSet::new();

        let first = derive_candidates(&event, &mut seen);
        assert_eq!(first.len(), 1);

        let second = derive_candidates(&event, &mut seen);
        assert_eq!(second.len(), 0); // Duplicate suppressed
    }

    #[test]
    fn test_plan_event_yields_candidate() {
        let raw = json!({
            "event_name": "plan-review",
            "tool_name": "Plan",
            "tool_input": {"plan": "Implement feature X, then test"}
        });

        let event = normalize_claude_payload("claude-code", "plan-review", &raw);
        let mut seen = HashSet::new();
        let candidates = derive_candidates(&event, &mut seen);

        assert_eq!(candidates.len(), 1);
        let candidate = &candidates[0];
        assert!(candidate.signal_score >= 0.3);
        assert!(candidate.labels.contains(&"plan".to_string()));
        assert_eq!(candidate.provisional_category, Some("context".to_string()));
    }

    #[test]
    fn test_empty_event_yields_no_candidates() {
        let raw = json!({});

        let event = normalize_claude_payload("claude-code", "empty", &raw);
        let mut seen = HashSet::new();
        let candidates = derive_candidates(&event, &mut seen);

        assert!(candidates.is_empty());
    }

    #[test]
    fn test_high_signal_label_added() {
        let raw = json!({
            "event_name": "user-prompt-submit",
            "user_message": "I always prefer using tabs over spaces in my code. This is a strong preference that applies to all languages.",
            "assistant_message": "I'll configure your editor to use tabs by default for all file types."
        });

        let event = normalize_claude_payload("claude-code", "user-prompt-submit", &raw);
        let mut seen = HashSet::new();
        let candidates = derive_candidates(&event, &mut seen);

        assert_eq!(candidates.len(), 1);
        let candidate = &candidates[0];
        assert!(candidate.signal_score >= 0.7);
        assert!(candidate.labels.contains(&"high-signal".to_string()));
    }

    #[test]
    fn test_evidence_construction() {
        // Well over the 200-character excerpt limit, so truncation must kick in.
        let long_response = "fn main() { println!(\"hello\"); }\n".repeat(20);
        let raw = json!({
            "tool_name": "Read",
            "tool_input": {"file_path": "src/main.rs"},
            "tool_response": long_response,
            "assistant_message": "The file contains the main function with error handling."
        });

        let event = normalize_claude_payload("claude-code", "post-tool-use", &raw);
        let mut seen = HashSet::new();
        let candidates = derive_candidates(&event, &mut seen);

        assert_eq!(candidates.len(), 1);
        let candidate = &candidates[0];
        assert!(candidate.evidence.get("tool_name").is_some());
        assert!(candidate.evidence.get("tool_input").is_some());

        let excerpt = candidate
            .evidence
            .get("tool_response_excerpt")
            .and_then(|v| v.as_str())
            .expect("evidence should contain a tool response excerpt");
        assert!(excerpt.ends_with("..."));
        assert_eq!(excerpt.chars().count(), 203); // 200 + "..."
    }

    #[test]
    fn test_truncate_utf8_multibyte() {
        // Japanese characters are 3 bytes each in UTF-8
        let s = "日本語テスト文字列";
        assert_eq!(truncate_str(s, 100), s);
        let truncated = truncate_str(s, 4);
        assert_eq!(truncated, "日本語テ...");
        // Verify it's valid UTF-8
        assert!(std::str::from_utf8(truncated.as_bytes()).is_ok());
    }

    #[test]
    fn test_truncate_mixed_ascii_multibyte() {
        let s = "Hello日本語World";
        assert_eq!(truncate_str(s, 100), s);
        let truncated = truncate_str(s, 7);
        assert_eq!(truncated, "Hello日本...");
        assert!(std::str::from_utf8(truncated.as_bytes()).is_ok());
    }

    #[test]
    fn test_truncate_empty_and_short() {
        assert_eq!(truncate_str("", 10), "");
        assert_eq!(truncate_str("hi", 10), "hi");
        assert_eq!(truncate_str("hello", 5), "hello");
    }

    #[test]
    fn test_truncate_exact_boundary() {
        let s = "abcdefghij";
        assert_eq!(truncate_str(s, 10), s); // exactly at limit, no "..."
        let longer = "abcdefghijklmno";
        assert_eq!(truncate_str(longer, 10), "abcdefghij...");
    }

    #[test]
    fn test_different_sessions_different_fingerprints() {
        // Identical, non-trivial tool calls that differ only in their session.
        // A noise command such as `echo` would be filtered by its low score
        // and could not show whether the session takes part in the fingerprint.
        let raw1 = json!({
            "tool_name": "Bash",
            "tool_input": {"command": "cargo test"},
            "tool_response": "test result: ok",
            "session_id": "sess-A"
        });

        let raw2 = json!({
            "tool_name": "Bash",
            "tool_input": {"command": "cargo test"},
            "tool_response": "test result: ok",
            "session_id": "sess-B"
        });

        let event1 = normalize_claude_payload("claude-code", "post-tool-use", &raw1);
        let event2 = normalize_claude_payload("claude-code", "post-tool-use", &raw2);

        let mut seen = HashSet::new();
        let candidates1 = derive_candidates(&event1, &mut seen);
        let candidates2 = derive_candidates(&event2, &mut seen);

        // Different sessions must not be considered duplicates of each other.
        assert_eq!(candidates1.len(), 1);
        assert_eq!(candidates2.len(), 1);
    }
}