Skip to main content

semantic_diff/grouper/
mod.rs

1pub mod llm;
2
3use crate::diff::DiffData;
4use serde::Deserialize;
5
6/// Response envelope from LLM grouping request.
7#[derive(Debug, Clone, Deserialize)]
8pub struct GroupingResponse {
9    pub groups: Vec<SemanticGroup>,
10}
11
12/// A semantic group of related changes (hunk-level granularity).
13/// Accepts both `changes` (hunk-level) and `files` (file-level fallback) from LLM.
14#[derive(Debug, Clone, Deserialize)]
15pub struct SemanticGroup {
16    pub label: String,
17    #[serde(default)]
18    #[allow(dead_code)]
19    pub description: String,
20    /// Hunk-level changes (preferred format).
21    #[serde(default)]
22    changes: Vec<GroupedChange>,
23    /// File-level fallback: if LLM returns `"files": ["path"]` instead of `changes`.
24    #[serde(default)]
25    files: Vec<String>,
26}
27
28impl SemanticGroup {
29    /// Create a SemanticGroup from hunk-level changes.
30    pub fn new(label: String, description: String, changes: Vec<GroupedChange>) -> Self {
31        Self {
32            label,
33            description,
34            changes,
35            files: vec![],
36        }
37    }
38
39    /// Get the list of changes, normalizing the `files` fallback into `changes`.
40    pub fn changes(&self) -> Vec<GroupedChange> {
41        if !self.changes.is_empty() {
42            return self.changes.clone();
43        }
44        // Fallback: convert file-level list to changes with empty hunks (= all hunks)
45        self.files
46            .iter()
47            .map(|f| GroupedChange {
48                file: f.clone(),
49                hunks: vec![],
50            })
51            .collect()
52    }
53}
54
55/// A reference to specific hunks within a file that belong to a group.
56#[derive(Debug, Clone, Deserialize)]
57pub struct GroupedChange {
58    pub file: String,
59    /// 0-based hunk indices. If empty, means all hunks in the file.
60    #[serde(default)]
61    pub hunks: Vec<usize>,
62}
63
/// Lifecycle state of an asynchronous grouping request.
#[derive(Clone, Debug, PartialEq)]
pub enum GroupingStatus {
    /// Nothing has been attempted yet, or no LLM backend is available.
    Idle,
    /// A request is in flight and the LLM has not answered yet.
    Loading,
    /// Groups were received and applied.
    Done,
    /// The LLM call failed (timeout, parse error, etc.); carries the error text.
    Error(String),
}
76
77/// Build hunk-level summaries for the LLM prompt from parsed diff data.
78///
79/// Format:
80/// ```text
81/// FILE: src/app.rs (modified, +10 -3)
82///   HUNK 0: @@ -100,6 +100,16 @@ impl App
83///     + pub fn new_method() {
84///     + ...
85///   HUNK 1: @@ -200,3 +210,5 @@ fn handle_key
86///     - old_call();
87///     + new_call();
88/// ```
89/// Max total characters for the summaries prompt to keep LLM response fast.
90const MAX_SUMMARY_CHARS: usize = 8000;
91
92pub fn hunk_summaries(diff_data: &DiffData) -> String {
93    let mut out = String::new();
94    for f in &diff_data.files {
95        let path = f.target_file.trim_start_matches("b/");
96        let status = if f.is_rename {
97            format!("renamed from {}", f.source_file.trim_start_matches("a/"))
98        } else if f.added_count > 0 && f.removed_count == 0 {
99            "added".to_string()
100        } else if f.removed_count > 0 && f.added_count == 0 {
101            "deleted".to_string()
102        } else {
103            "modified".to_string()
104        };
105        out.push_str(&format!(
106            "FILE: {} ({}, +{} -{})\n",
107            path, status, f.added_count, f.removed_count
108        ));
109
110        for (hi, hunk) in f.hunks.iter().enumerate() {
111            out.push_str(&format!("  HUNK {}: {}\n", hi, hunk.header));
112
113            // Include a brief sample of changed lines (up to 4 lines) if under budget
114            if out.len() < MAX_SUMMARY_CHARS {
115                let mut shown = 0;
116                for line in &hunk.lines {
117                    if shown >= 4 {
118                        out.push_str("    ...\n");
119                        break;
120                    }
121                    match line.line_type {
122                        crate::diff::LineType::Added => {
123                            out.push_str(&format!("    + {}\n", truncate(&line.content, 60)));
124                            shown += 1;
125                        }
126                        crate::diff::LineType::Removed => {
127                            out.push_str(&format!("    - {}\n", truncate(&line.content, 60)));
128                            shown += 1;
129                        }
130                        _ => {}
131                    }
132                }
133            }
134        }
135
136        if out.len() >= MAX_SUMMARY_CHARS {
137            out.push_str("... (remaining files omitted for brevity)\n");
138            break;
139        }
140    }
141    out
142}
143
/// Return the longest prefix of `s` that is at most `max` bytes long and ends on a
/// UTF-8 character boundary.
///
/// The result borrows from `s` and is always valid UTF-8. A string that already fits
/// is returned unchanged.
fn truncate(s: &str, max: usize) -> &str {
    if max >= s.len() {
        return s;
    }
    // Walk back from `max` to the nearest boundary; index 0 is always a boundary,
    // so the search cannot come up empty.
    let cut = (0..=max)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
158
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII is cut at exactly the requested byte count.
    #[test]
    fn test_truncate_ascii() {
        let cut = truncate("hello", 3);
        assert_eq!(cut, "hel");
    }

    /// Input shorter than the limit comes back untouched.
    #[test]
    fn test_truncate_shorter_than_max() {
        let cut = truncate("hi", 10);
        assert_eq!(cut, "hi");
    }

    /// A limit that lands inside a 3-byte CJK character must not panic.
    #[test]
    fn test_truncate_cjk_at_boundary_no_panic() {
        // Four CJK characters at 3 bytes each in UTF-8: 世界你好 (12 bytes).
        let text = "\u{4e16}\u{754c}\u{4f60}\u{597d}";
        // Byte 4 is inside the second character, so the cut backs up to byte 3.
        let expected = "\u{4e16}"; // 世 (3 bytes)
        assert_eq!(truncate(text, 4), expected);
    }

    /// A limit that lands inside a 4-byte emoji must not panic.
    #[test]
    fn test_truncate_emoji_at_boundary_no_panic() {
        // 'a' + crab emoji (U+1F980, 4 bytes) + 'b' = 1 + 4 + 1 = 6 bytes.
        let text = "a\u{1f980}b";
        // Byte 3 is in the middle of the emoji, so the cut backs up to byte 1.
        let expected = "a";
        assert_eq!(truncate(text, 3), expected);
    }

    /// A limit equal to the string length keeps the whole string.
    #[test]
    fn test_truncate_exact_boundary() {
        let cut = truncate("hello", 5);
        assert_eq!(cut, "hello");
    }

    /// A zero limit yields the empty string.
    #[test]
    fn test_truncate_zero() {
        let cut = truncate("hello", 0);
        assert_eq!(cut, "");
    }
}