//! semantic_diff/grouper/mod.rs — semantic grouping of diff changes via an LLM backend.
1pub mod llm;
2
3use crate::diff::DiffData;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::collections::hash_map::DefaultHasher;
7use std::hash::{Hash, Hasher};
8
/// Response envelope from LLM grouping request.
///
/// Deserialized from the LLM's JSON reply; the payload is just the list of
/// proposed semantic groups.
#[derive(Debug, Clone, Deserialize)]
pub struct GroupingResponse {
    pub groups: Vec<SemanticGroup>,
}
14
/// A semantic group of related changes (hunk-level granularity).
/// Accepts both `changes` (hunk-level) and `files` (file-level fallback) from LLM.
#[derive(Debug, Clone, Deserialize)]
pub struct SemanticGroup {
    /// Short human-readable name of the group (e.g. as assigned by the LLM).
    pub label: String,
    /// Longer explanation of the group. Deserialized but not read anywhere
    /// visible in this module (hence `allow(dead_code)`).
    #[serde(default)]
    #[allow(dead_code)]
    pub description: String,
    /// Hunk-level changes (preferred format).
    /// Private on purpose: read through `changes()`, which normalizes the
    /// `files` fallback; write through `set_changes()`, which clears it.
    #[serde(default)]
    changes: Vec<GroupedChange>,
    /// File-level fallback: if LLM returns `"files": ["path"]` instead of `changes`.
    /// `changes()` promotes each path to a `GroupedChange` with empty hunks.
    #[serde(default)]
    files: Vec<String>,
}
30
31impl SemanticGroup {
32    /// Create a SemanticGroup from hunk-level changes.
33    pub fn new(label: String, description: String, changes: Vec<GroupedChange>) -> Self {
34        Self {
35            label,
36            description,
37            changes,
38            files: vec![],
39        }
40    }
41
42    /// Replace the changes list directly.
43    pub fn set_changes(&mut self, changes: Vec<GroupedChange>) {
44        self.changes = changes;
45        self.files.clear();
46    }
47
48    /// Get the list of changes, normalizing the `files` fallback into `changes`.
49    pub fn changes(&self) -> Vec<GroupedChange> {
50        if !self.changes.is_empty() {
51            return self.changes.clone();
52        }
53        // Fallback: convert file-level list to changes with empty hunks (= all hunks)
54        self.files
55            .iter()
56            .map(|f| GroupedChange {
57                file: f.clone(),
58                hunks: vec![],
59            })
60            .collect()
61    }
62}
63
/// A reference to specific hunks within a file that belong to a group.
#[derive(Debug, Clone, Deserialize)]
pub struct GroupedChange {
    /// File path; expected to match the paths emitted by the summary builders
    /// (i.e. with the diff's `b/` prefix stripped) — see `hunk_summaries`.
    pub file: String,
    /// 0-based hunk indices. If empty, means all hunks in the file.
    #[serde(default)]
    pub hunks: Vec<usize>,
}
72
/// Tracks the lifecycle of an async grouping request.
///
/// Transitions are driven by the caller; nothing in this module enforces an
/// ordering between variants.
#[derive(Debug, Clone, PartialEq)]
pub enum GroupingStatus {
    /// No grouping attempted yet (or no LLM backend available).
    Idle,
    /// Waiting for LLM response.
    Loading,
    /// Groups received and applied.
    Done,
    /// LLM call failed (timeout, parse error, etc.).
    /// Carries a human-readable error message.
    Error(String),
}
85
// Example of the prompt format produced by `hunk_summaries` and
// `incremental_hunk_summaries` (regular comment on purpose: it documents the
// builders below, not the const it used to be doc-attached to):
//
//   FILE: src/app.rs (modified, +10 -3)
//     HUNK 0: @@ -100,6 +100,16 @@ impl App
//       + pub fn new_method() {
//       + ...
//     HUNK 1: @@ -200,3 +210,5 @@ fn handle_key
//       - old_call();
//       + new_call();

/// Max total characters for the summaries prompt to keep LLM response fast.
const MAX_SUMMARY_CHARS: usize = 8000;
100
101pub fn hunk_summaries(diff_data: &DiffData) -> String {
102    let mut out = String::new();
103    for f in &diff_data.files {
104        let path = f.target_file.trim_start_matches("b/");
105        let status = if f.is_rename {
106            format!("renamed from {}", f.source_file.trim_start_matches("a/"))
107        } else if f.added_count > 0 && f.removed_count == 0 {
108            "added".to_string()
109        } else if f.removed_count > 0 && f.added_count == 0 {
110            "deleted".to_string()
111        } else {
112            "modified".to_string()
113        };
114        out.push_str(&format!(
115            "FILE: {} ({}, +{} -{})\n",
116            path, status, f.added_count, f.removed_count
117        ));
118
119        for (hi, hunk) in f.hunks.iter().enumerate() {
120            out.push_str(&format!("  HUNK {}: {}\n", hi, hunk.header));
121
122            // Include a brief sample of changed lines (up to 4 lines) if under budget
123            if out.len() < MAX_SUMMARY_CHARS {
124                let mut shown = 0;
125                for line in &hunk.lines {
126                    if shown >= 4 {
127                        out.push_str("    ...\n");
128                        break;
129                    }
130                    match line.line_type {
131                        crate::diff::LineType::Added => {
132                            out.push_str(&format!("    + {}\n", truncate(&line.content, 60)));
133                            shown += 1;
134                        }
135                        crate::diff::LineType::Removed => {
136                            out.push_str(&format!("    - {}\n", truncate(&line.content, 60)));
137                            shown += 1;
138                        }
139                        _ => {}
140                    }
141                }
142            }
143        }
144
145        if out.len() >= MAX_SUMMARY_CHARS {
146            out.push_str("... (remaining files omitted for brevity)\n");
147            break;
148        }
149    }
150    out
151}
152
153/// Compute a stable hash of a file's diff content (hunk headers + line types + line content).
154/// Used to detect whether a file's diff has changed between refreshes.
155pub fn compute_file_hash(file: &crate::diff::DiffFile) -> u64 {
156    let mut hasher = DefaultHasher::new();
157    for hunk in &file.hunks {
158        hunk.header.hash(&mut hasher);
159        for line in &hunk.lines {
160            // Discriminant: 0 = Added, 1 = Removed, 2 = Context
161            let discriminant: u8 = match line.line_type {
162                crate::diff::LineType::Added => 0,
163                crate::diff::LineType::Removed => 1,
164                crate::diff::LineType::Context => 2,
165            };
166            discriminant.hash(&mut hasher);
167            line.content.hash(&mut hasher);
168        }
169    }
170    hasher.finish()
171}
172
173/// Compute hashes for all files in a diff. Key = file path with `b/` prefix stripped.
174pub fn compute_all_file_hashes(diff_data: &DiffData) -> HashMap<String, u64> {
175    diff_data
176        .files
177        .iter()
178        .map(|f| {
179            let path = f.target_file.trim_start_matches("b/").to_string();
180            (path, compute_file_hash(f))
181        })
182        .collect()
183}
184
/// Categorization of files between two diff snapshots.
///
/// Produced by `compute_diff_delta`; all paths are keyed the same way as the
/// hash maps it compares (`b/` prefix stripped).
#[derive(Debug, Clone, Serialize)]
pub struct DiffDelta {
    /// Files that are new (not in previous grouping).
    pub new_files: Vec<String>,
    /// Files that were removed (in previous but not in new diff).
    pub removed_files: Vec<String>,
    /// Files whose diff content changed.
    pub modified_files: Vec<String>,
    /// Files whose diff content is identical.
    pub unchanged_files: Vec<String>,
}
197
198impl DiffDelta {
199    pub fn has_changes(&self) -> bool {
200        !self.new_files.is_empty()
201            || !self.removed_files.is_empty()
202            || !self.modified_files.is_empty()
203    }
204
205    pub fn is_only_removals(&self) -> bool {
206        self.new_files.is_empty()
207            && self.modified_files.is_empty()
208            && !self.removed_files.is_empty()
209    }
210}
211
212/// Compare new file hashes against previous to categorize each file.
213pub fn compute_diff_delta(
214    new_hashes: &HashMap<String, u64>,
215    previous_hashes: &HashMap<String, u64>,
216) -> DiffDelta {
217    let mut new_files = Vec::new();
218    let mut modified_files = Vec::new();
219    let mut unchanged_files = Vec::new();
220
221    for (path, &new_hash) in new_hashes {
222        match previous_hashes.get(path) {
223            None => new_files.push(path.clone()),
224            Some(&prev_hash) if prev_hash != new_hash => modified_files.push(path.clone()),
225            _ => unchanged_files.push(path.clone()),
226        }
227    }
228
229    let removed_files = previous_hashes
230        .keys()
231        .filter(|p| !new_hashes.contains_key(*p))
232        .cloned()
233        .collect();
234
235    DiffDelta {
236        new_files,
237        removed_files,
238        modified_files,
239        unchanged_files,
240    }
241}
242
243/// Build hunk summaries for ONLY new/modified files, prepended with existing group context.
244///
245/// Format:
246/// ```text
247/// EXISTING GROUPS (for context — assign new changes to these or create new groups):
248/// 1. "Auth refactor" — files: src/auth.rs, src/middleware.rs
249///
250/// NEW/MODIFIED FILES TO GROUP:
251/// FILE: src/router.rs (added, +20 -0)
252///   HUNK 0: @@ ...
253///     + pub fn new_route() {
254/// ```
255pub fn incremental_hunk_summaries(
256    diff_data: &DiffData,
257    delta: &DiffDelta,
258    existing_groups: &[SemanticGroup],
259) -> String {
260    let mut out = String::new();
261
262    // --- Existing group context ---
263    if !existing_groups.is_empty() {
264        out.push_str(
265            "EXISTING GROUPS (for context \u{2014} assign new changes to these or create new groups):\n",
266        );
267        for (i, group) in existing_groups.iter().enumerate() {
268            let changes = group.changes();
269            let file_list: Vec<&str> = changes.iter().map(|c| c.file.as_str()).collect();
270            out.push_str(&format!(
271                "{}. \"{}\" \u{2014} files: {}\n",
272                i + 1,
273                group.label,
274                file_list.join(", ")
275            ));
276        }
277        out.push('\n');
278    }
279
280    out.push_str("NEW/MODIFIED FILES TO GROUP:\n");
281
282    // Collect the set of files to include (new + modified)
283    let include: std::collections::HashSet<&str> = delta
284        .new_files
285        .iter()
286        .chain(delta.modified_files.iter())
287        .map(|s| s.as_str())
288        .collect();
289
290    for f in &diff_data.files {
291        let path = f.target_file.trim_start_matches("b/");
292        if !include.contains(path) {
293            continue;
294        }
295
296        let status = if f.is_rename {
297            format!("renamed from {}", f.source_file.trim_start_matches("a/"))
298        } else if f.added_count > 0 && f.removed_count == 0 {
299            "added".to_string()
300        } else if f.removed_count > 0 && f.added_count == 0 {
301            "deleted".to_string()
302        } else {
303            "modified".to_string()
304        };
305        out.push_str(&format!(
306            "FILE: {} ({}, +{} -{})\n",
307            path, status, f.added_count, f.removed_count
308        ));
309
310        for (hi, hunk) in f.hunks.iter().enumerate() {
311            out.push_str(&format!("  HUNK {}: {}\n", hi, hunk.header));
312
313            if out.len() < MAX_SUMMARY_CHARS {
314                let mut shown = 0;
315                for line in &hunk.lines {
316                    if shown >= 4 {
317                        out.push_str("    ...\n");
318                        break;
319                    }
320                    match line.line_type {
321                        crate::diff::LineType::Added => {
322                            out.push_str(&format!("    + {}\n", truncate(&line.content, 60)));
323                            shown += 1;
324                        }
325                        crate::diff::LineType::Removed => {
326                            out.push_str(&format!("    - {}\n", truncate(&line.content, 60)));
327                            shown += 1;
328                        }
329                        _ => {}
330                    }
331                }
332            }
333        }
334
335        if out.len() >= MAX_SUMMARY_CHARS {
336            out.push_str("... (remaining files omitted for brevity)\n");
337            break;
338        }
339    }
340
341    out
342}
343
344/// Post-process grouping results: fill in explicit hunk indices when `hunks` is empty
345/// and the file has multiple hunks, so the UI can filter hunks per group correctly.
346pub fn normalize_hunk_indices(groups: &mut [SemanticGroup], diff_data: &DiffData) {
347    // Build a map from file path -> hunk count
348    let hunk_counts: HashMap<String, usize> = diff_data
349        .files
350        .iter()
351        .map(|f| {
352            let path = f.target_file.trim_start_matches("b/").to_string();
353            (path, f.hunks.len())
354        })
355        .collect();
356
357    for group in groups.iter_mut() {
358        let mut updated = group.changes();
359        for change in updated.iter_mut() {
360            if change.hunks.is_empty() {
361                if let Some(&count) = hunk_counts.get(&change.file) {
362                    if count > 1 {
363                        change.hunks = (0..count).collect();
364                    }
365                }
366            }
367        }
368        group.set_changes(updated);
369    }
370}
371
372/// Remove all entries for the given file paths from existing groups.
373/// Groups that become empty after removal are dropped.
374pub fn remove_files_from_groups(groups: &mut Vec<SemanticGroup>, files_to_remove: &[String]) {
375    if files_to_remove.is_empty() {
376        return;
377    }
378    let remove_set: std::collections::HashSet<&str> =
379        files_to_remove.iter().map(|s| s.as_str()).collect();
380
381    groups.retain_mut(|group| {
382        let filtered: Vec<GroupedChange> = group
383            .changes()
384            .into_iter()
385            .filter(|c| !remove_set.contains(c.file.as_str()))
386            .collect();
387        group.set_changes(filtered);
388        !group.changes().is_empty()
389    });
390}
391
392/// Merge new LLM grouping assignments into existing groups.
393///
394/// Steps:
395/// 1. Clone existing groups.
396/// 2. Remove entries for `removed_files` and `modified_files` (stale data).
397/// 3. For each group in `new_assignments`:
398///    - If label matches an existing group (case-insensitive), merge changes into it.
399///    - Otherwise, append as a new group.
400/// 4. Remove empty groups.
401pub fn merge_groups(
402    existing: &[SemanticGroup],
403    new_assignments: &[SemanticGroup],
404    delta: &DiffDelta,
405) -> Vec<SemanticGroup> {
406    let mut merged: Vec<SemanticGroup> = existing.to_vec();
407
408    // Remove stale file entries
409    let stale: Vec<String> = delta
410        .removed_files
411        .iter()
412        .chain(delta.modified_files.iter())
413        .cloned()
414        .collect();
415    remove_files_from_groups(&mut merged, &stale);
416
417    // Integrate new assignments
418    for new_group in new_assignments {
419        let new_changes = new_group.changes();
420        if new_changes.is_empty() {
421            continue;
422        }
423
424        // Find existing group with matching label (case-insensitive)
425        let existing_pos = merged
426            .iter()
427            .position(|g| g.label.to_lowercase() == new_group.label.to_lowercase());
428
429        if let Some(pos) = existing_pos {
430            let mut combined = merged[pos].changes();
431            combined.extend(new_changes);
432            merged[pos].set_changes(combined);
433        } else {
434            merged.push(new_group.clone());
435        }
436    }
437
438    // Drop any groups that ended up empty
439    merged.retain(|g| !g.changes().is_empty());
440
441    merged
442}
443
/// Truncate a string to at most `max` bytes, respecting UTF-8 char boundaries.
/// Returns a string slice that is always valid UTF-8.
fn truncate(s: &str, max: usize) -> &str {
    if s.len() <= max {
        return s;
    }
    // Walk backwards from `max` to the nearest char boundary. Index 0 is
    // always a boundary, so `find` cannot come up empty.
    let cut = (0..=max)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    &s[..cut]
}
458
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_truncate_ascii() {
        assert_eq!(truncate("hello", 3), "hel");
    }

    #[test]
    fn test_truncate_shorter_than_max() {
        assert_eq!(truncate("hi", 10), "hi");
    }

    #[test]
    fn test_truncate_cjk_at_boundary_no_panic() {
        // CJK characters are 3 bytes each in UTF-8
        let s = "\u{4e16}\u{754c}\u{4f60}\u{597d}"; // 世界你好 (12 bytes)
        // Truncating at byte 4 should not panic -- it should back up to byte 3
        let result = truncate(s, 4);
        assert_eq!(result, "\u{4e16}"); // 世 (3 bytes)
    }

    #[test]
    fn test_truncate_cjk_max_exactly_on_boundary() {
        // New coverage: `max` landing exactly on a multibyte char boundary
        // must keep the whole leading char, not back up past it.
        let s = "\u{4e16}\u{754c}"; // 6 bytes, boundary at 3
        assert_eq!(truncate(s, 3), "\u{4e16}");
    }

    #[test]
    fn test_truncate_emoji_at_boundary_no_panic() {
        // Emoji like 🦀 are 4 bytes in UTF-8
        let s = "a🦀b"; // 1 + 4 + 1 = 6 bytes
        // Truncating at byte 3 (middle of emoji) should not panic
        let result = truncate(s, 3);
        assert_eq!(result, "a"); // backs up to byte 1
    }

    #[test]
    fn test_truncate_exact_boundary() {
        assert_eq!(truncate("hello", 5), "hello");
    }

    #[test]
    fn test_truncate_zero() {
        assert_eq!(truncate("hello", 0), "");
    }
}