Skip to main content

ralph_workflow/phases/commit/
diff_truncation.rs

/// Default prompt budget in bytes; used for agents that match no known family
/// and when no agent names are supplied.
const MAX_SAFE_PROMPT_SIZE: u64 = 200_000;

/// Prompt budget in bytes for GLM-like agents (GLM, Zhipu, Qwen, DeepSeek).
const GLM_MAX_PROMPT_SIZE: u64 = 100_000;

/// Prompt budget in bytes for Claude-based agents.
const CLAUDE_MAX_PROMPT_SIZE: u64 = 300_000;
9
10/// Get the maximum safe prompt size for a specific agent.
11pub fn model_budget_bytes_for_agent_name(commit_agent: &str) -> u64 {
12    let agent_lower = commit_agent.to_lowercase();
13
14    if agent_lower.contains("glm")
15        || agent_lower.contains("zhipuai")
16        || agent_lower.contains("zai")
17        || agent_lower.contains("qwen")
18        || agent_lower.contains("deepseek")
19    {
20        GLM_MAX_PROMPT_SIZE
21    } else if agent_lower.contains("claude")
22        || agent_lower.contains("ccs")
23        || agent_lower.contains("anthropic")
24    {
25        CLAUDE_MAX_PROMPT_SIZE
26    } else {
27        MAX_SAFE_PROMPT_SIZE
28    }
29}
30
31pub fn effective_model_budget_bytes(agent_names: &[String]) -> u64 {
32    agent_names
33        .iter()
34        .map(|name| model_budget_bytes_for_agent_name(name))
35        .min()
36        .unwrap_or(MAX_SAFE_PROMPT_SIZE)
37}
38
39/// Truncate diff if it's too large for agents with small context windows.
40fn truncate_diff_if_large(diff: &str, max_size: usize) -> String {
41    if diff.len() <= max_size {
42        return diff.to_string();
43    }
44
45    let mut files: Vec<DiffFile> = Vec::new();
46    let mut current_file = DiffFile::default();
47    let mut in_file = false;
48
49    for line in diff.lines() {
50        if line.starts_with("diff --git ") {
51            if in_file && !current_file.lines.is_empty() {
52                files.push(std::mem::take(&mut current_file));
53            }
54            in_file = true;
55            current_file.lines.push(line.to_string());
56
57            if let Some(path) = line.split(" b/").nth(1) {
58                current_file.path = path.to_string();
59                current_file.priority = prioritize_file_path(path);
60            }
61        } else if in_file {
62            current_file.lines.push(line.to_string());
63        }
64    }
65
66    if in_file && !current_file.lines.is_empty() {
67        files.push(current_file);
68    }
69
70    files.sort_by(|a, b| b.priority.cmp(&a.priority));
71
72    let mut result = String::new();
73    let mut current_size = 0;
74    let mut files_included = 0;
75    let total_files = files.len();
76
77    for file in &files {
78        let file_size: usize = file.lines.iter().map(|l| l.len() + 1).sum();
79
80        if current_size + file_size <= max_size {
81            for line in &file.lines {
82                result.push_str(line);
83                result.push('\n');
84            }
85            current_size += file_size;
86            files_included += 1;
87        } else if files_included == 0 {
88            let truncated_lines = truncate_lines_to_fit(&file.lines, max_size);
89            for line in truncated_lines {
90                result.push_str(&line);
91                result.push('\n');
92            }
93            files_included = 1;
94            break;
95        } else {
96            break;
97        }
98    }
99
100    if files_included < total_files {
101        let summary = format!(
102            "\n[Truncated: {} of {} files shown]\n",
103            files_included, total_files
104        );
105        if summary.len() <= max_size {
106            if result.len() + summary.len() <= max_size {
107                result.push_str(&summary);
108            } else {
109                let target_bytes = max_size.saturating_sub(summary.len());
110                if target_bytes < result.len() {
111                    let mut cut = 0usize;
112                    for (idx, _) in result.char_indices() {
113                        if idx > target_bytes {
114                            break;
115                        }
116                        cut = idx;
117                    }
118                    result.truncate(cut);
119                }
120                result.push_str(&summary);
121            }
122        }
123    }
124
125    result
126}
127
128pub fn truncate_diff_to_model_budget(diff: &str, max_size_bytes: u64) -> (String, bool) {
129    let max_size = usize::try_from(max_size_bytes).unwrap_or(usize::MAX);
130    if diff.len() <= max_size {
131        (diff.to_string(), false)
132    } else {
133        (truncate_diff_if_large(diff, max_size), true)
134    }
135}
136
/// One file's section of a diff, plus the metadata used to rank it.
#[derive(Default)]
struct DiffFile {
    /// Path text taken from the section's `diff --git` header; empty when no
    /// path could be extracted.
    path: String,
    /// Ranking weight; sections with larger values are emitted first.
    priority: i32,
    /// The section's lines, header first, stored without line terminators.
    lines: Vec<String>,
}
143
/// Rank a file path for inclusion in a truncated diff; higher is more important.
///
/// Backslashes are treated as path separators. A path with a `src` component
/// scores 100, otherwise one with a `tests` component scores 50, otherwise a
/// path ending in `.md` or `.txt` scores 10, and everything else scores 0.
fn prioritize_file_path(path: &str) -> i32 {
    let unified = path.replace('\\', "/");
    let has_component = |wanted: &str| unified.split('/').any(|segment| segment == wanted);

    if has_component("src") {
        return 100;
    }
    if has_component("tests") {
        return 50;
    }

    let looks_like_docs = unified.ends_with(".md") || unified.ends_with(".txt");
    if looks_like_docs {
        10
    } else {
        0
    }
}
158
/// Keep the longest prefix of `lines` that fits in `max_size` bytes and mark it as cut.
///
/// Each line is budgeted as its byte length plus one for the newline the
/// caller appends. Once the prefix is chosen, `" [truncated...]"` is appended
/// to the last kept line. To make room for that marker the last line is
/// shortened on a UTF-8 character boundary, and lines too short to absorb the
/// overflow are dropped.
///
/// Returns an empty vector when `lines` is empty, when the first line alone
/// exceeds the budget, or when no kept line can be shortened enough.
fn truncate_lines_to_fit(lines: &[String], max_size: usize) -> Vec<String> {
    const MARKER: &str = " [truncated...]";

    // Greedily take leading lines while the running total stays within budget.
    let mut used = 0usize;
    let mut kept: Vec<String> = lines
        .iter()
        .take_while(|line| {
            let with_line = used + line.len() + 1;
            let fits = with_line <= max_size;
            if fits {
                used = with_line;
            }
            fits
        })
        .cloned()
        .collect();

    // Drop trailing lines that are too short to absorb the marker's overflow.
    while let Some(tail_len) = kept.last().map(String::len) {
        let overflow = (used + MARKER.len()).saturating_sub(max_size);
        if overflow == 0 || overflow < tail_len {
            break;
        }
        kept.pop();
        used = used.saturating_sub(tail_len + 1);
    }

    if let Some(tail) = kept.last_mut() {
        let overflow = (used + MARKER.len()).saturating_sub(max_size);
        if overflow > 0 {
            // The loop above guarantees `overflow < tail.len()` here. Walk back
            // to a character boundary so `truncate` cannot split a code point.
            let mut cut = tail.len() - overflow;
            while !tail.is_char_boundary(cut) {
                cut -= 1;
            }
            tail.truncate(cut);
        }
        tail.push_str(MARKER);
    }

    kept
}
216
/// Unit tests for diff truncation: path prioritisation and the
/// "output never exceeds `max_size`" invariant.
#[cfg(test)]
mod diff_truncation_tests {
    use super::*;

    #[test]
    fn prioritize_file_path_handles_crate_prefixed_paths() {
        // Real diffs in this repo often include crate-prefixed paths like `ralph-workflow/src/...`.
        // These should still be treated as high-priority source changes.
        assert_eq!(prioritize_file_path("ralph-workflow/src/lib.rs"), 100);
        assert_eq!(prioritize_file_path("ralph-workflow/tests/integration.rs"), 50);
        assert_eq!(prioritize_file_path("README.md"), 10);
    }

    #[test]
    fn truncate_diff_to_model_budget_never_exceeds_max_size() {
        let files_included = 1;
        let total_files = 2;
        let summary = format!(
            "\n[Truncated: {} of {} files shown]\n",
            files_included, total_files
        );

        let max_size = 1_000usize;

        // Craft a diff where:
        // - file 1 fits within max_size
        // - file 2 does not fit, so a truncation summary is appended
        // - file 1 content is sized so adding summary would exceed max_size
        let file1_header = "diff --git a/src/a.rs b/src/a.rs";
        let desired_file1_size = max_size - summary.len() + 1;
        let filler_line_len = desired_file1_size.saturating_sub(file1_header.len() + 2);
        let file1 = format!(
            "{file1_header}\n+{}\n",
            "x".repeat(filler_line_len.saturating_sub(1))
        );

        let file2 = "diff --git a/tests/b.rs b/tests/b.rs\n+small\n";
        let diff = format!("{file1}{file2}");

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(was_truncated, "expected truncation when diff exceeds max size");
        assert!(
            truncated.len() <= max_size,
            "truncated diff must not exceed max_size (got {} > {})",
            truncated.len(),
            max_size
        );
    }

    #[test]
    fn truncate_lines_to_fit_reserves_space_for_truncation_suffix() {
        // Regression test: truncate_lines_to_fit() used to append " [truncated...]" after
        // selecting lines that fit max_size, which could push the final output over the
        // intended max_size budget.
        let max_size = 20usize;
        let lines = vec!["x".repeat(max_size - 1)];

        let truncated = truncate_lines_to_fit(&lines, max_size);

        let total_size: usize = truncated.iter().map(|l| l.len() + 1).sum();
        assert!(
            total_size <= max_size,
            "truncate_lines_to_fit must not exceed max_size after adding suffix (got {total_size} > {max_size})"
        );
    }

    // =========================================================================
    // Exhaustive edge case tests for truncation invariants
    // =========================================================================

    /// Test that truncation output never exceeds max_size for various edge cases.
    ///
    /// This exhaustively tests boundary conditions around the truncation summary
    /// appending logic to ensure the invariant "output.len() <= max_size" holds.
    #[test]
    fn truncate_diff_invariant_never_exceeds_max_size_edge_cases() {
        // Test various max_size values around the summary length
        let summary_len = "\n[Truncated: 1 of 2 files shown]\n".len();

        for max_size in [
            10,               // Very small
            summary_len - 1,  // Just under summary
            summary_len,      // Exactly summary
            summary_len + 1,  // Just over summary
            summary_len + 10, // Summary + small content
            100,              // Reasonable small size
            1000,             // Reasonable larger size
        ] {
            let file1 = format!(
                "diff --git a/src/a.rs b/src/a.rs\n+{}\n",
                "x".repeat(max_size)
            );
            let file2 = "diff --git a/tests/b.rs b/tests/b.rs\n+extra\n";
            let diff = format!("{file1}{file2}");

            let (truncated, _) = truncate_diff_to_model_budget(&diff, max_size as u64);
            // The preview is taken by characters, not bytes, so building the
            // failure message can never panic on a non-boundary slice.
            assert!(
                truncated.len() <= max_size,
                "truncated diff exceeded max_size {} (got {}): {:?}",
                max_size,
                truncated.len(),
                truncated.chars().take(100).collect::<String>()
            );
        }
    }

    /// Test truncation with content exactly at boundary conditions.
    #[test]
    fn truncate_diff_boundary_content_sizes() {
        for max_size in [50usize, 100, 200, 500] {
            // Content exactly at max_size - should not truncate
            let header = "diff --git a/a b/a\n+";
            let exact_diff = format!(
                "{}{}",
                header,
                "x".repeat(max_size.saturating_sub(header.len()))
            );
            if exact_diff.len() == max_size {
                let (result, was_truncated) =
                    truncate_diff_to_model_budget(&exact_diff, max_size as u64);
                assert!(!was_truncated, "exact size should not trigger truncation");
                assert_eq!(result.len(), max_size);
            }

            // Content one byte over max_size - should truncate
            let over_diff = format!(
                "{}{}",
                header,
                "x".repeat(max_size + 1 - header.len())
            );
            let (result, was_truncated) =
                truncate_diff_to_model_budget(&over_diff, max_size as u64);
            assert!(was_truncated, "over size should trigger truncation");
            assert!(
                result.len() <= max_size,
                "truncated result {} should not exceed max_size {}",
                result.len(),
                max_size
            );
        }
    }

    /// Test that single-file diffs that exceed max_size are properly truncated.
    #[test]
    fn truncate_single_large_file_stays_within_budget() {
        let max_size = 100usize;

        // Single file that's way too big
        let large_file = format!(
            "diff --git a/src/big.rs b/src/big.rs\n+{}\n",
            "x".repeat(max_size * 3)
        );

        let (truncated, was_truncated) =
            truncate_diff_to_model_budget(&large_file, max_size as u64);
        assert!(was_truncated, "large file should be truncated");
        assert!(
            truncated.len() <= max_size,
            "single large file truncation {} exceeded max_size {}",
            truncated.len(),
            max_size
        );
    }

    /// Test truncation with unicode content (multi-byte characters).
    #[test]
    fn truncate_diff_handles_unicode_boundaries() {
        let max_size = 50usize;

        // Unicode content: each emoji is 4 bytes
        let emoji_line = "🎉".repeat(20); // 80 bytes
        let diff = format!("diff --git a/a b/a\n+{}\n", emoji_line);

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(was_truncated, "unicode diff should be truncated");
        assert!(
            truncated.len() <= max_size,
            "unicode truncation {} exceeded max_size {}",
            truncated.len(),
            max_size
        );
        // A `String` is always valid UTF-8, so a cut in the middle of a character
        // would show up as a panic inside the truncation code rather than here.
        // The assertion is kept to document the intent.
        assert!(
            std::str::from_utf8(truncated.as_bytes()).is_ok(),
            "truncated output should be valid UTF-8"
        );
    }

    /// Test that empty diff is handled correctly.
    #[test]
    fn truncate_empty_diff() {
        let (result, was_truncated) = truncate_diff_to_model_budget("", 100);
        assert!(!was_truncated, "empty diff should not be truncated");
        assert_eq!(result, "");
    }

    /// Test truncation with multiple small files.
    ///
    /// The files are supplied lowest-priority first so that the output order
    /// proves the sections were re-ranked rather than copied in input order.
    #[test]
    fn truncate_multiple_small_files_prefers_high_priority() {
        let max_size = 200usize;

        // Create multiple files with different priorities
        let src_file = "diff --git a/src/main.rs b/src/main.rs\n+high priority\n";
        let test_file = "diff --git a/tests/test.rs b/tests/test.rs\n+medium priority\n";
        let doc_file = "diff --git a/README.md b/README.md\n+low priority docs\n";
        let extra = "diff --git a/extra.rs b/extra.rs\n+extra content that exceeds budget\n";

        let diff = format!("{doc_file}{test_file}{src_file}{extra}");

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(was_truncated, "should truncate when files exceed budget");
        assert!(
            truncated.len() <= max_size,
            "truncated {} exceeded max_size {}",
            truncated.len(),
            max_size
        );

        // These checks are unconditional: an empty or docs-only result must fail.
        let high_at = truncated
            .find("high priority")
            .expect("src change must survive truncation");
        let medium_at = truncated
            .find("medium priority")
            .expect("tests change must survive truncation");
        assert!(
            high_at < medium_at,
            "src change should be emitted before tests change"
        );
        if let Some(low_at) = truncated.find("low priority") {
            assert!(
                medium_at < low_at,
                "tests change should be emitted before docs change"
            );
        }
        assert!(
            !truncated.contains("extra content"),
            "lowest-priority file should be the one left out"
        );
        assert!(
            truncated.contains("[Truncated: 3 of 4 files shown]"),
            "summary should report how many files were kept"
        );
    }
}