Skip to main content

spool/
enrich.rs

1//! Heuristic enrichment for lifecycle records.
2//!
3//! Extracts structured fields (entities, tags, triggers) from a record's
4//! title and summary using rule-based heuristics. Used by `sync-vault --enrich`
5//! to backfill older records that lack these fields.
6
7use crate::domain::MemoryRecord;
8
/// Fields that heuristic enrichment proposes for a single record.
///
/// Every vector may be empty; an empty vector means enrichment has nothing
/// to contribute for that field.
#[derive(Debug, Clone, Default)]
pub struct EnrichmentPatch {
    /// Proposed entity names.
    pub entities: Vec<String>,
    /// Proposed tags.
    pub tags: Vec<String>,
    /// Proposed trigger words.
    pub triggers: Vec<String>,
}

impl EnrichmentPatch {
    /// Returns `true` when the patch carries no entities, no tags and no triggers.
    pub fn is_empty(&self) -> bool {
        [&self.entities, &self.tags, &self.triggers]
            .iter()
            .all(|field| field.is_empty())
    }
}
22
/// Canonical spellings of tools, libraries and technologies recognised during
/// entity extraction.
///
/// Order matters: entities are reported in this order before the entity cap is
/// applied, so earlier entries win when a text mentions many tools.
#[rustfmt::skip]
const KNOWN_TOOLS: &[&str] = &[
    // Programming languages
    "Rust", "TypeScript", "JavaScript", "Python", "Go", "Java", "Swift",
    // Frontend frameworks
    "React", "Vue", "Svelte", "Angular", "Next.js", "Nuxt",
    // Databases and stores
    "SQLite", "PostgreSQL", "MySQL", "Redis", "MongoDB",
    // Infrastructure and cloud
    "Docker", "Kubernetes", "Terraform", "AWS", "GCP", "Azure",
    // Version control
    "Git", "GitHub", "GitLab",
    // Applications and desktop shells
    "Obsidian", "Tauri", "Electron",
    // Rust crates
    "tantivy", "tokio", "serde", "clap", "anyhow",
    // Package managers
    "npm", "cargo", "pip", "brew",
    // Protocols and API styles
    "MCP", "JSON-RPC", "REST", "GraphQL", "gRPC",
    // Auth and web security
    "OAuth", "JWT", "CSRF", "XSS", "CORS",
    // Engineering practices
    "CI", "CD", "TDD", "BDD",
];
79
/// Lowercase filler words that carry no signal on their own; callers compare
/// lowercased words against this list and skip any match.
#[rustfmt::skip]
const STOP_WORDS: &[&str] = &[
    // Articles
    "the", "a", "an",
    // Forms of "to be"
    "is", "are", "was", "were", "be", "been",
    // Prepositions
    "to", "of", "in", "for", "on", "with", "at", "by", "from",
    // Conjunctions, negation and conditionals
    "and", "or", "not", "no", "but", "if", "then", "else",
    // Determiners and pronouns
    "this", "that", "it", "its", "my", "your", "our",
    // Auxiliary and modal verbs
    "do", "does", "did", "will", "would", "should", "could", "have", "has", "had", "can", "may",
    "might",
    // Forms of "use"
    "use", "using", "used",
];
87
/// Tag inference table: each entry pairs a set of lowercase keywords with the
/// tag to emit when any of those keywords is found in a record's text.
///
/// Rules are evaluated top to bottom, so earlier rules take priority when the
/// number of tags is capped.
#[rustfmt::skip]
const TAG_RULES: &[(&[&str], &str)] = &[
    (&["database", "db", "sql", "postgres", "sqlite", "mysql", "redis", "mongo"], "database"),
    (&["test", "spec", "assert", "mock", "tdd", "coverage"], "testing"),
    (&["auth", "oauth", "jwt", "login", "session", "permission"], "auth"),
    (&["api", "endpoint", "route", "rest", "graphql", "grpc"], "api"),
    (&["deploy", "ci", "cd", "pipeline", "docker", "kubernetes"], "devops"),
    (&["security", "csrf", "xss", "cors", "vulnerability", "secret"], "security"),
    (&["performance", "cache", "optimize", "latency", "throughput"], "performance"),
    (&["ui", "frontend", "component", "layout", "style", "css"], "frontend"),
    (&["config", "setting", "environment", "env", "toml", "yaml"], "config"),
    (&["error", "exception", "panic", "crash", "bug", "fix"], "error-handling"),
    (&["refactor", "cleanup", "rename", "restructure", "simplify"], "refactoring"),
    (&["document", "readme", "comment", "doc", "guide"], "documentation"),
];
141
142/// Analyze a record and produce an enrichment patch with entities, tags, and triggers.
143/// Only returns fields that the record currently lacks (empty vectors).
144pub fn enrich_record(record: &MemoryRecord) -> EnrichmentPatch {
145    let text = format!("{} {}", record.title, record.summary);
146    let text_lower = text.to_lowercase();
147
148    let entities = if record.entities.is_empty() {
149        extract_entities(&text)
150    } else {
151        Vec::new()
152    };
153
154    let tags = if record.tags.is_empty() {
155        extract_tags(&text_lower, &record.memory_type)
156    } else {
157        Vec::new()
158    };
159
160    let triggers = if record.triggers.is_empty() {
161        extract_triggers(&record.title)
162    } else {
163        Vec::new()
164    };
165
166    EnrichmentPatch {
167        entities,
168        tags,
169        triggers,
170    }
171}
172
173/// Extract entities from text: known tools/libraries and capitalized multi-char words.
174fn extract_entities(text: &str) -> Vec<String> {
175    let mut entities = Vec::new();
176    let mut seen = std::collections::HashSet::new();
177
178    // Check for known tools (case-insensitive match, preserve canonical casing)
179    let text_lower = text.to_lowercase();
180    for tool in KNOWN_TOOLS {
181        let tool_lower = tool.to_lowercase();
182        if text_lower.contains(&tool_lower) && seen.insert(tool_lower) {
183            entities.push(tool.to_string());
184        }
185    }
186
187    // Extract capitalized words that look like proper nouns (>= 2 chars, starts uppercase)
188    for word in text.split_whitespace() {
189        let cleaned = word.trim_matches(|c: char| !c.is_alphanumeric());
190        if cleaned.len() >= 2
191            && cleaned
192                .chars()
193                .next()
194                .map(|c| c.is_uppercase())
195                .unwrap_or(false)
196            && !cleaned
197                .chars()
198                .all(|c| c.is_uppercase() || !c.is_alphabetic())
199            && !STOP_WORDS.contains(&cleaned.to_lowercase().as_str())
200        {
201            let lower = cleaned.to_lowercase();
202            // Skip if already captured as a known tool
203            if seen.insert(lower) {
204                entities.push(cleaned.to_string());
205            }
206        }
207    }
208
209    // Cap at 5 entities
210    entities.truncate(5);
211    entities
212}
213
214/// Extract tags based on keyword matching and memory_type mapping.
215fn extract_tags(text_lower: &str, memory_type: &str) -> Vec<String> {
216    let mut tags = Vec::new();
217
218    // Memory type → base tag
219    match memory_type {
220        "constraint" => tags.push("constraint".to_string()),
221        "decision" => tags.push("decision".to_string()),
222        "incident" => tags.push("incident".to_string()),
223        "workflow" => tags.push("workflow".to_string()),
224        "pattern" => tags.push("pattern".to_string()),
225        _ => {}
226    }
227
228    // Keyword-based tag inference
229    for (keywords, tag) in TAG_RULES {
230        if keywords.iter().any(|kw| text_lower.contains(kw)) && !tags.contains(&tag.to_string()) {
231            tags.push(tag.to_string());
232        }
233    }
234
235    // Cap at 4 tags
236    tags.truncate(4);
237    tags
238}
239
240/// Extract triggers: first 2-3 significant words from the title.
241fn extract_triggers(title: &str) -> Vec<String> {
242    let words: Vec<&str> = title
243        .split_whitespace()
244        .map(|w| w.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_'))
245        .filter(|w| w.len() >= 2 && !STOP_WORDS.contains(&w.to_lowercase().as_str()))
246        .take(3)
247        .collect();
248
249    words.iter().map(|w| w.to_lowercase()).collect()
250}
251
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{
        MemoryLifecycleState, MemoryOrigin, MemoryRecord, MemoryScope, MemorySourceKind,
    };

    /// Builds a record with the given title, summary and memory type; the
    /// entities, tags and triggers vectors start out empty so every field is
    /// eligible for enrichment.
    fn make_record(title: &str, summary: &str, memory_type: &str) -> MemoryRecord {
        let origin = MemoryOrigin {
            source_kind: MemorySourceKind::Manual,
            source_ref: String::from("test"),
        };

        MemoryRecord {
            title: title.to_owned(),
            summary: summary.to_owned(),
            memory_type: memory_type.to_owned(),
            scope: MemoryScope::User,
            state: MemoryLifecycleState::Accepted,
            origin,
            project_id: None,
            user_id: None,
            sensitivity: None,
            entities: vec![],
            tags: vec![],
            triggers: vec![],
            related_files: vec![],
            related_records: vec![],
            supersedes: None,
            applies_to: vec![],
            valid_until: None,
        }
    }

    /// Tools named in the title or summary show up as entities with canonical casing.
    #[test]
    fn enrich_should_extract_known_tools_as_entities() {
        let patch = enrich_record(&make_record(
            "Use PostgreSQL for persistence",
            "Store lifecycle data in PostgreSQL with Docker for local dev",
            "decision",
        ));

        assert!(patch.entities.contains(&"PostgreSQL".to_string()));
        assert!(patch.entities.contains(&"Docker".to_string()));
    }

    /// Tags come from both the memory type and keyword rules.
    #[test]
    fn enrich_should_extract_tags_from_keywords() {
        let patch = enrich_record(&make_record(
            "Database migration strategy",
            "Always use reversible SQL migrations",
            "decision",
        ));

        assert!(patch.tags.contains(&"decision".to_string()));
        assert!(patch.tags.contains(&"database".to_string()));
    }

    /// Triggers are lowercased significant words taken from the title.
    #[test]
    fn enrich_should_extract_triggers_from_title() {
        let patch = enrich_record(&make_record(
            "Prefer immutable data structures",
            "Use immutable patterns to avoid side effects",
            "constraint",
        ));

        assert!(!patch.triggers.is_empty());
        assert!(patch.triggers.contains(&"prefer".to_string()));
        assert!(patch.triggers.contains(&"immutable".to_string()));
    }

    /// Fields the record already populates are not proposed again; missing
    /// ones still are.
    #[test]
    fn enrich_should_skip_fields_that_already_have_values() {
        let mut record = make_record(
            "Use Rust for CLI",
            "Rust provides good performance",
            "decision",
        );
        record.entities = vec!["Rust".to_string()];
        record.tags = vec!["language".to_string()];
        // `record.triggers` is deliberately left empty.

        let patch = enrich_record(&record);

        assert!(
            patch.entities.is_empty(),
            "should skip entities since record already has them"
        );
        assert!(
            patch.tags.is_empty(),
            "should skip tags since record already has them"
        );
        assert!(!patch.triggers.is_empty(), "should still extract triggers");
    }

    /// A record with all three fields populated yields an empty patch.
    #[test]
    fn enrich_empty_patch_should_report_is_empty() {
        let mut record = make_record("x", "y", "preference");
        record.entities = vec!["a".to_string()];
        record.tags = vec!["b".to_string()];
        record.triggers = vec!["c".to_string()];

        assert!(enrich_record(&record).is_empty());
    }
}