Skip to main content

ralph_workflow/phases/commit/
diff_truncation.rs

/// Default prompt ceiling in bytes before pre-truncation; used for any agent
/// whose name matches no known family.
const MAX_SAFE_PROMPT_SIZE: u64 = 200_000;

/// Prompt ceiling in bytes for GLM-like agents (GLM, Zhipu, Qwen, `DeepSeek`).
const GLM_MAX_PROMPT_SIZE: u64 = 100_000;

/// Prompt ceiling in bytes for Claude-based agents.
const CLAUDE_MAX_PROMPT_SIZE: u64 = 300_000;

/// Pick the prompt budget (in bytes) for the agent called `commit_agent`.
///
/// The name is lowercased and searched for family markers as substrings.
/// GLM-like markers are checked first, so a name that mentions both families
/// receives the smaller GLM budget. Names matching neither family fall back to
/// [`MAX_SAFE_PROMPT_SIZE`].
#[must_use]
pub fn model_budget_bytes_for_agent_name(commit_agent: &str) -> u64 {
    const GLM_LIKE: [&str; 5] = ["glm", "zhipuai", "zai", "qwen", "deepseek"];
    const CLAUDE_LIKE: [&str; 3] = ["claude", "ccs", "anthropic"];

    let lowered = commit_agent.to_lowercase();
    let mentions_any =
        |needles: &[&str]| needles.iter().any(|&needle| lowered.contains(needle));

    if mentions_any(&GLM_LIKE) {
        GLM_MAX_PROMPT_SIZE
    } else if mentions_any(&CLAUDE_LIKE) {
        CLAUDE_MAX_PROMPT_SIZE
    } else {
        MAX_SAFE_PROMPT_SIZE
    }
}
31
32#[must_use] 
33pub fn effective_model_budget_bytes(agent_names: &[String]) -> u64 {
34    agent_names
35        .iter()
36        .map(|name| model_budget_bytes_for_agent_name(name))
37        .min()
38        .unwrap_or(MAX_SAFE_PROMPT_SIZE)
39}
40
41/// Truncate diff if it's too large for agents with small context windows.
42fn truncate_diff_if_large(diff: &str, max_size: usize) -> String {
43    if diff.len() <= max_size {
44        return diff.to_string();
45    }
46
47    let mut files: Vec<DiffFile> = Vec::new();
48    let mut current_file = DiffFile::default();
49    let mut in_file = false;
50
51    for line in diff.lines() {
52        if line.starts_with("diff --git ") {
53            if in_file && !current_file.lines.is_empty() {
54                files.push(std::mem::take(&mut current_file));
55            }
56            in_file = true;
57            current_file.lines.push(line.to_string());
58
59            if let Some(path) = line.split(" b/").nth(1) {
60                current_file.path = path.to_string();
61                current_file.priority = prioritize_file_path(path);
62            }
63        } else if in_file {
64            current_file.lines.push(line.to_string());
65        }
66    }
67
68    if in_file && !current_file.lines.is_empty() {
69        files.push(current_file);
70    }
71
72    files.sort_by(|a, b| b.priority.cmp(&a.priority));
73
74    let mut result = String::new();
75    let mut current_size = 0;
76    let mut files_included = 0;
77    let total_files = files.len();
78
79    for file in &files {
80        let file_size: usize = file.lines.iter().map(|l| l.len() + 1).sum();
81
82        if current_size + file_size <= max_size {
83            for line in &file.lines {
84                result.push_str(line);
85                result.push('\n');
86            }
87            current_size += file_size;
88            files_included += 1;
89        } else if files_included == 0 {
90            let truncated_lines = truncate_lines_to_fit(&file.lines, max_size);
91            for line in truncated_lines {
92                result.push_str(&line);
93                result.push('\n');
94            }
95            files_included = 1;
96            break;
97        } else {
98            break;
99        }
100    }
101
102    if files_included < total_files {
103        let summary = format!(
104            "\n[Truncated: {files_included} of {total_files} files shown]\n"
105        );
106        if summary.len() <= max_size {
107            if result.len() + summary.len() > max_size {
108                let target_bytes = max_size.saturating_sub(summary.len());
109                if target_bytes < result.len() {
110                    let mut cut = 0usize;
111                    for (idx, _) in result.char_indices() {
112                        if idx > target_bytes {
113                            break;
114                        }
115                        cut = idx;
116                    }
117                    result.truncate(cut);
118                }
119            }
120            result.push_str(&summary);
121        }
122    }
123
124    result
125}
126
127#[must_use] 
128pub fn truncate_diff_to_model_budget(diff: &str, max_size_bytes: u64) -> (String, bool) {
129    let max_size = usize::try_from(max_size_bytes).unwrap_or(usize::MAX);
130    if diff.len() <= max_size {
131        (diff.to_string(), false)
132    } else {
133        (truncate_diff_if_large(diff, max_size), true)
134    }
135}
136
/// One per-file section of a diff, collected while splitting the input on
/// `diff --git ` header lines.
#[derive(Default)]
struct DiffFile {
    /// Text between the first and second ` b/` in the header line (or to the
    /// end of the line); empty when the header contains no ` b/` separator.
    /// NOTE(review): set during parsing but not read in the code visible here.
    path: String,
    /// Score from `prioritize_file_path`; sections with larger values are
    /// emitted first. Stays `0` when no path could be extracted.
    priority: i32,
    /// The header line followed by every line up to the next header, stored
    /// without trailing newlines.
    lines: Vec<String>,
}
143
/// Score a diff path so that more important files are kept first.
///
/// Backslashes are treated as path separators. The score is:
/// - `100` if any path component is exactly `src`,
/// - `50` if any component is exactly `tests`,
/// - `10` if the extension is `md` or `txt` (ASCII case-insensitive),
/// - `0` otherwise.
///
/// Earlier rules win, so `src/tests/x.rs` scores `100`.
fn prioritize_file_path(path: &str) -> i32 {
    let unix_style = path.replace('\\', "/");
    let has_component =
        |wanted: &str| unix_style.split('/').any(|segment| segment == wanted);

    if has_component("src") {
        return 100;
    }
    if has_component("tests") {
        return 50;
    }

    let is_text_doc = std::path::Path::new(&unix_style)
        .extension()
        .is_some_and(|ext| ext.eq_ignore_ascii_case("md") || ext.eq_ignore_ascii_case("txt"));

    if is_text_doc {
        10
    } else {
        0
    }
}
161
/// Shorten `s` in place to at most `max_bytes` bytes without splitting a
/// multi-byte character.
///
/// Strings already within the limit are left untouched. Otherwise the cut is
/// placed at the largest char boundary that is `<= max_bytes`, so the result
/// may be up to three bytes shorter than `max_bytes`.
fn truncate_to_utf8_boundary(s: &mut String, max_bytes: usize) {
    if s.len() <= max_bytes {
        return;
    }
    // Walk back from the limit instead of scanning from the start: a UTF-8
    // char is at most 4 bytes, so this takes at most 3 steps. Index 0 is
    // always a boundary, so the loop cannot underflow.
    let mut cut = max_bytes;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    s.truncate(cut);
}
175
176fn truncate_lines_to_fit(lines: &[String], max_size: usize) -> Vec<String> {
177    let mut result = Vec::new();
178    let mut current_size = 0;
179
180    for line in lines {
181        let line_size = line.len() + 1;
182        if current_size + line_size <= max_size {
183            current_size += line_size;
184            result.push(line.clone());
185        } else {
186            break;
187        }
188    }
189
190    let suffix = " [truncated...]";
191    let suffix_len = suffix.len();
192
193    if !result.is_empty() {
194        // current_size tracks line lengths + '\n' for each included line.
195        // Appending the suffix increases size; ensure we stay within max_size
196        // by trimming from the last included line if needed.
197        let mut total_size = current_size;
198        while !result.is_empty() && total_size + suffix_len > max_size {
199            let last_len = result.last().expect("checked non-empty").len();
200            let excess = total_size + suffix_len - max_size;
201            if excess < last_len {
202                let new_len = last_len - excess;
203                let last = result.last_mut().expect("checked non-empty");
204                truncate_to_utf8_boundary(last, new_len);
205                break;
206            }
207            // Can't trim enough from last line; drop it and retry.
208            let dropped = result.pop().expect("checked non-empty");
209            total_size = total_size.saturating_sub(dropped.len() + 1);
210        }
211
212        if let Some(last) = result.last_mut() {
213            last.push_str(suffix);
214        }
215    }
216
217    result
218}
219
/// Unit tests for the diff truncation helpers. Most of them pin the invariant
/// that truncated output never exceeds the requested byte budget.
#[cfg(test)]
mod diff_truncation_tests {
    use super::*;

    /// Paths with a leading crate directory are still ranked by their `src` /
    /// `tests` component.
    #[test]
    fn prioritize_file_path_handles_crate_prefixed_paths() {
        // Real diffs in this repo often include crate-prefixed paths like `ralph-workflow/src/...`.
        // These should still be treated as high-priority source changes.
        assert_eq!(prioritize_file_path("ralph-workflow/src/lib.rs"), 100);
        assert_eq!(prioritize_file_path("ralph-workflow/tests/integration.rs"), 50);
        assert_eq!(prioritize_file_path("README.md"), 10);
    }

    /// The first file fits on its own, but appending the truncation summary
    /// would overflow; the output must still be capped at `max_size`.
    #[test]
    fn truncate_diff_to_model_budget_never_exceeds_max_size() {
        // Rebuild the summary text the implementation emits so its length can
        // be used to size the first file.
        let files_included = 1;
        let total_files = 2;
        let summary = format!(
            "\n[Truncated: {files_included} of {total_files} files shown]\n"
        );

        let max_size = 1_000usize;

        // Craft a diff where:
        // - file 1 fits within max_size
        // - file 2 does not fit, so a truncation summary is appended
        // - file 1 content is sized so adding summary would exceed max_size
        let file1_header = "diff --git a/src/a.rs b/src/a.rs";
        let desired_file1_size = max_size - summary.len() + 1;
        let filler_line_len = desired_file1_size.saturating_sub(file1_header.len() + 2);
        let file1 = format!(
            "{file1_header}\n+{}\n",
            "x".repeat(filler_line_len.saturating_sub(1))
        );

        let file2 = "diff --git a/tests/b.rs b/tests/b.rs\n+small\n";
        let diff = format!("{file1}{file2}");

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(was_truncated, "expected truncation when diff exceeds max size");
        assert!(
            truncated.len() <= max_size,
            "truncated diff must not exceed max_size (got {} > {})",
            truncated.len(),
            max_size
        );
    }

    /// The suffix added by `truncate_lines_to_fit` must be counted against
    /// the budget.
    #[test]
    fn truncate_lines_to_fit_reserves_space_for_truncation_suffix() {
        // Regression test: truncate_lines_to_fit() used to append " [truncated...]" after
        // selecting lines that fit max_size, which could push the final output over the
        // intended max_size budget.
        let max_size = 20usize;
        // One line that fills the budget exactly once its newline is counted.
        let lines = vec!["x".repeat(max_size - 1)];

        let truncated = truncate_lines_to_fit(&lines, max_size);

        // Measure the same way the caller emits: each line plus one newline.
        let total_size: usize = truncated.iter().map(|l| l.len() + 1).sum();
        assert!(
            total_size <= max_size,
            "truncate_lines_to_fit must not exceed max_size after adding suffix (got {total_size} > {max_size})"
        );
    }

    // =========================================================================
    // Exhaustive edge case tests for truncation invariants
    // =========================================================================

    /// Test that truncation output never exceeds `max_size` for various edge cases.
    ///
    /// This exhaustively tests boundary conditions around the truncation summary
    /// appending logic to ensure the invariant "`output.len()` <= `max_size`" holds.
    #[test]
    fn truncate_diff_invariant_never_exceeds_max_size_edge_cases() {
        // Test various max_size values around the summary length
        let summary_len = "\n[Truncated: 1 of 2 files shown]\n".len();

        for max_size in [
            10,                 // Very small
            summary_len - 1,    // Just under summary
            summary_len,        // Exactly summary
            summary_len + 1,    // Just over summary
            summary_len + 10,   // Summary + small content
            100,                // Reasonable small size
            1000,               // Reasonable larger size
        ] {
            // The first file's payload line alone is `max_size` bytes, so the
            // file can never fit whole.
            let file1 = format!(
                "diff --git a/src/a.rs b/src/a.rs\n+{}\n",
                "x".repeat(max_size)
            );
            let file2 = "diff --git a/tests/b.rs b/tests/b.rs\n+extra\n";
            let diff = format!("{file1}{file2}");

            let (truncated, _) = truncate_diff_to_model_budget(&diff, max_size as u64);
            assert!(
                truncated.len() <= max_size,
                "truncated diff exceeded max_size {} (got {}): {:?}",
                max_size,
                truncated.len(),
                // Preview only; the test content is ASCII, so byte slicing is safe.
                &truncated[..truncated.len().min(100)]
            );
        }
    }

    /// Test truncation with content exactly at boundary conditions.
    #[test]
    fn truncate_diff_boundary_content_sizes() {
        for max_size in [50usize, 100, 200, 500] {
            // Content exactly at max_size - should not truncate
            // (`header` is 20 bytes, shorter than every max_size used here.)
            let header = "diff --git a/a b/a\n+";
            let exact_diff = format!(
                "{}{}",
                header,
                "x".repeat(max_size.saturating_sub(header.len()))
            );
            if exact_diff.len() == max_size {
                let (result, was_truncated) =
                    truncate_diff_to_model_budget(&exact_diff, max_size as u64);
                assert!(!was_truncated, "exact size should not trigger truncation");
                assert_eq!(result.len(), max_size);
            }

            // Content one byte over max_size - should truncate
            let over_diff = format!(
                "{}{}",
                header,
                "x".repeat(max_size + 1 - header.len())
            );
            let (result, was_truncated) =
                truncate_diff_to_model_budget(&over_diff, max_size as u64);
            assert!(was_truncated, "over size should trigger truncation");
            assert!(
                result.len() <= max_size,
                "truncated result {} should not exceed max_size {}",
                result.len(),
                max_size
            );
        }
    }

    /// Test that single-file diffs that exceed `max_size` are properly truncated.
    #[test]
    fn truncate_single_large_file_stays_within_budget() {
        let max_size = 100usize;

        // Single file that's way too big
        let large_file = format!(
            "diff --git a/src/big.rs b/src/big.rs\n+{}\n",
            "x".repeat(max_size * 3)
        );

        let (truncated, was_truncated) =
            truncate_diff_to_model_budget(&large_file, max_size as u64);
        assert!(was_truncated, "large file should be truncated");
        assert!(
            truncated.len() <= max_size,
            "single large file truncation {} exceeded max_size {}",
            truncated.len(),
            max_size
        );
    }

    /// Test truncation with unicode content (multi-byte characters).
    #[test]
    fn truncate_diff_handles_unicode_boundaries() {
        let max_size = 50usize;

        // Unicode content: each emoji is 4 bytes
        let emoji_line = "🎉".repeat(20); // 80 bytes
        let diff = format!("diff --git a/a b/a\n+{emoji_line}\n");

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(was_truncated, "unicode diff should be truncated");
        assert!(
            truncated.len() <= max_size,
            "unicode truncation {} exceeded max_size {}",
            truncated.len(),
            max_size
        );
        // Verify we didn't split a multi-byte character
        assert!(
            std::str::from_utf8(truncated.as_bytes()).is_ok(),
            "truncated output should be valid UTF-8"
        );
    }

    /// Test that empty diff is handled correctly.
    #[test]
    fn truncate_empty_diff() {
        let (result, was_truncated) = truncate_diff_to_model_budget("", 100);
        assert!(!was_truncated, "empty diff should not be truncated");
        assert_eq!(result, "");
    }

    /// Test truncation with multiple small files.
    #[test]
    fn truncate_multiple_small_files_prefers_high_priority() {
        let max_size = 200usize;

        // Create multiple files with different priorities
        let src_file = "diff --git a/src/main.rs b/src/main.rs\n+high priority\n";
        let test_file = "diff --git a/tests/test.rs b/tests/test.rs\n+medium priority\n";
        let doc_file = "diff --git a/README.md b/README.md\n+low priority docs\n";
        let extra = "diff --git a/extra.rs b/extra.rs\n+extra content that exceeds budget\n";

        // Lowest-priority files come first in the input, so the output order
        // depends on the priority sort rather than on input order.
        let diff = format!("{doc_file}{test_file}{src_file}{extra}");

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(was_truncated, "should truncate when files exceed budget");
        assert!(
            truncated.len() <= max_size,
            "truncated {} exceeded max_size {}",
            truncated.len(),
            max_size
        );
        // High priority src file should be included before low priority docs
        // NOTE(review): this check is skipped when no "priority" text survives.
        if truncated.contains("priority") {
            assert!(
                truncated.contains("high priority") || truncated.contains("medium priority"),
                "should prioritize src/tests over docs"
            );
        }
    }
}