Skip to main content

retro_core/analysis/
merge.rs

1use crate::models::{Pattern, PatternStatus, PatternUpdate};
2use chrono::Utc;
3use uuid::Uuid;
4
/// Minimum normalized Levenshtein similarity for two descriptions to count as the same pattern.
///
/// A candidate whose similarity to an existing description is strictly greater than this
/// value is merged into that existing pattern instead of being inserted as a new one.
const SIMILARITY_THRESHOLD: f64 = 0.8;
7
8/// Process AI-returned pattern updates against existing patterns.
9/// Returns (new patterns to insert, updates to apply to existing patterns).
10pub fn process_updates(
11    updates: Vec<PatternUpdate>,
12    existing: &[Pattern],
13    project: Option<&str>,
14) -> (Vec<Pattern>, Vec<MergeUpdate>) {
15    let mut new_patterns = Vec::new();
16    let mut merge_updates = Vec::new();
17    let now = Utc::now();
18
19    for update in updates {
20        match update {
21            PatternUpdate::New(new) => {
22                // Safety net: check if this is a near-duplicate of an existing pattern
23                if let Some(match_id) = find_similar_pattern(&new.description, existing) {
24                    // Merge into existing instead of creating new
25                    merge_updates.push(MergeUpdate {
26                        pattern_id: match_id,
27                        new_sessions: new.source_sessions,
28                        new_confidence: new.confidence,
29                        additional_times_seen: 1,
30                    });
31                } else {
32                    // Genuinely new pattern
33                    let pattern = Pattern {
34                        id: Uuid::new_v4().to_string(),
35                        pattern_type: new.pattern_type,
36                        description: new.description,
37                        confidence: new.confidence,
38                        times_seen: 1,
39                        first_seen: now,
40                        last_seen: now,
41                        last_projected: None,
42                        status: PatternStatus::Discovered,
43                        source_sessions: new.source_sessions,
44                        related_files: new.related_files,
45                        suggested_content: new.suggested_content,
46                        suggested_target: new.suggested_target,
47                        project: project.map(String::from),
48                        generation_failed: false,
49                    };
50                    new_patterns.push(pattern);
51                }
52            }
53            PatternUpdate::Update(upd) => {
54                // Verify the referenced pattern exists
55                if existing.iter().any(|p| p.id == upd.existing_id) {
56                    merge_updates.push(MergeUpdate {
57                        pattern_id: upd.existing_id,
58                        new_sessions: upd.new_sessions,
59                        new_confidence: upd.new_confidence,
60                        additional_times_seen: 1,
61                    });
62                } else {
63                    eprintln!(
64                        "warning: AI referenced non-existent pattern ID: {}",
65                        upd.existing_id
66                    );
67                }
68            }
69        }
70    }
71
72    (new_patterns, merge_updates)
73}
74
/// A merge update to apply to an existing pattern in the DB.
///
/// Derives `Debug`, `Clone` and `PartialEq` so values can be logged, duplicated and
/// compared in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct MergeUpdate {
    /// ID of the existing pattern this update targets.
    pub pattern_id: String,
    /// Session identifiers carried by the update.
    pub new_sessions: Vec<String>,
    /// Confidence value reported by the update.
    pub new_confidence: f64,
    /// Observation count carried by the update; `process_updates` always sets this to 1.
    /// Presumably added to the pattern's `times_seen` when applied - confirm against the
    /// DB layer.
    pub additional_times_seen: i64,
}
82
83/// Find an existing pattern with description similarity > threshold.
84/// Returns the ID of the best match, if any.
85fn find_similar_pattern(description: &str, existing: &[Pattern]) -> Option<String> {
86    let mut best_match: Option<(String, f64)> = None;
87
88    for pattern in existing {
89        let similarity = normalized_similarity(description, &pattern.description);
90        if similarity > SIMILARITY_THRESHOLD {
91            match &best_match {
92                Some((_, best_sim)) if similarity > *best_sim => {
93                    best_match = Some((pattern.id.clone(), similarity));
94                }
95                None => {
96                    best_match = Some((pattern.id.clone(), similarity));
97                }
98                _ => {}
99            }
100        }
101    }
102
103    best_match.map(|(id, _)| id)
104}
105
106/// Compute normalized Levenshtein similarity between two strings.
107/// Returns a value in [0.0, 1.0] where 1.0 means identical.
108pub fn normalized_similarity(a: &str, b: &str) -> f64 {
109    let a_chars: Vec<char> = a.to_lowercase().chars().collect();
110    let b_chars: Vec<char> = b.to_lowercase().chars().collect();
111    let a_len = a_chars.len();
112    let b_len = b_chars.len();
113
114    let max_len = std::cmp::max(a_len, b_len);
115    if max_len == 0 {
116        return 1.0;
117    }
118
119    let distance = levenshtein_distance(&a_chars, &b_chars);
120    1.0 - (distance as f64 / max_len as f64)
121}
122
/// Levenshtein edit distance between two character slices.
///
/// Counts the minimum number of single-character insertions, deletions and substitutions
/// needed to turn `a` into `b`. Only two rows of the dynamic-programming table are kept
/// at a time.
fn levenshtein_distance(a: &[char], b: &[char]) -> usize {
    // If either side is empty the distance is simply the length of the other side.
    if a.is_empty() || b.is_empty() {
        return a.len().max(b.len());
    }

    let width = b.len() + 1;
    // `above` is the previous table row; it starts as the distances from the empty prefix.
    let mut above: Vec<usize> = (0..width).collect();
    let mut row = vec![0usize; width];

    for (row_idx, &left) in a.iter().enumerate() {
        row[0] = row_idx + 1;
        for (col, &right) in b.iter().enumerate() {
            let substitute = above[col] + usize::from(left != right);
            let delete = above[col + 1] + 1;
            let insert = row[col] + 1;
            row[col + 1] = delete.min(insert).min(substitute);
        }
        // The row just filled becomes the "previous" row for the next character of `a`.
        std::mem::swap(&mut above, &mut row);
    }

    above[b.len()]
}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `value` is within machine epsilon of 1.0.
    fn is_one(value: f64) -> bool {
        (value - 1.0).abs() < f64::EPSILON
    }

    /// Identical inputs must score exactly 1.0.
    #[test]
    fn test_identical_strings() {
        let score = normalized_similarity("hello", "hello");
        assert!(is_one(score));
    }

    /// Strings that share no characters must score below one half.
    #[test]
    fn test_completely_different() {
        let score = normalized_similarity("abc", "xyz");
        assert!(score < 0.5);
    }

    /// Descriptions that differ only in their tail should still score high.
    #[test]
    fn test_similar_strings() {
        let shorter = "Always use uv for Python packages";
        let longer = "Always use uv for Python package management";
        assert!(normalized_similarity(shorter, longer) > 0.7);
    }

    /// Two empty strings are treated as identical.
    #[test]
    fn test_empty_strings() {
        let score = normalized_similarity("", "");
        assert!(is_one(score));
    }

    /// Letter case must not affect the score.
    #[test]
    fn test_case_insensitive() {
        let score = normalized_similarity("Hello World", "hello world");
        assert!(is_one(score));
    }
}