ralph-workflow 0.7.18

/// Maximum safe prompt size in bytes before pre-truncation.
pub const MAX_SAFE_PROMPT_SIZE: u64 = 200_000;

use itertools::Itertools;

/// Maximum prompt size for GLM-like agents (GLM, Zhipu, Qwen, `DeepSeek`).
pub const GLM_MAX_PROMPT_SIZE: u64 = 100_000;

/// Maximum prompt size for Claude-based agents.
pub const CLAUDE_MAX_PROMPT_SIZE: u64 = 300_000;

/// Get the maximum safe prompt size for a specific agent.
#[must_use]
pub fn model_budget_bytes_for_agent_name(commit_agent: &str) -> u64 {
    let agent_lower = commit_agent.to_lowercase();

    if agent_lower.contains("glm")
        || agent_lower.contains("zhipuai")
        || agent_lower.contains("zai")
        || agent_lower.contains("qwen")
        || agent_lower.contains("deepseek")
    {
        GLM_MAX_PROMPT_SIZE
    } else if agent_lower.contains("claude")
        || agent_lower.contains("ccs")
        || agent_lower.contains("anthropic")
    {
        CLAUDE_MAX_PROMPT_SIZE
    } else {
        MAX_SAFE_PROMPT_SIZE
    }
}

#[must_use]
pub fn effective_model_budget_bytes(agent_names: &[String]) -> u64 {
    agent_names
        .iter()
        .map(|name| model_budget_bytes_for_agent_name(name))
        .min()
        .unwrap_or(MAX_SAFE_PROMPT_SIZE)
}

/// Truncate diff if it's too large for agents with small context windows.
pub fn truncate_diff_if_large(diff: &str, max_size: usize) -> String {
    if diff.len() <= max_size {
        return diff.to_string();
    }

    #[derive(Default)]
    struct Accumulator {
        files: Vec<DiffFile>,
        current_file: Option<DiffFile>,
    }

    impl Accumulator {
        fn process_line(self, line: &str) -> Self {
            if line.starts_with("diff --git ") {
                let priority = line
                    .split_once(" b/")
                    .map(|(_, after)| prioritize_file_path(after))
                    .unwrap_or_default();
                let new_file = DiffFile {
                    priority,
                    lines: vec![line.to_string()],
                };
                // Finalize the previous file and start a new one.
                let finalized_files: Vec<DiffFile> = self
                    .files
                    .into_iter()
                    .chain(
                        self.current_file
                            .into_iter()
                            .filter(|f| !f.lines.is_empty()),
                    )
                    .collect();
                Self {
                    files: finalized_files,
                    current_file: Some(new_file),
                }
            } else {
                match self.current_file {
                    Some(current) => Self {
                        files: self.files,
                        current_file: Some(DiffFile {
                            priority: current.priority,
                            lines: current
                                .lines
                                .into_iter()
                                .chain(std::iter::once(line.to_string()))
                                .collect(),
                        }),
                    },
                    None => self,
                }
            }
        }
    }

    let final_acc = diff
        .lines()
        .fold(Accumulator::default(), |acc, line| acc.process_line(line));

    let sorted_files: Vec<_> = {
        let Accumulator {
            files,
            current_file,
        } = final_acc;
        files
            .into_iter()
            .chain(current_file.into_iter().filter(|f| !f.lines.is_empty()))
            .sorted_by_key(|f| -f.priority)
            .collect()
    };

    let total_files = sorted_files.len();

    let (result, files_included) = {
        let file_data: Vec<_> = sorted_files
            .iter()
            .map(|file| {
                let lines_text: String = file.lines.iter().map(|l| format!("{l}\n")).collect();
                (lines_text.clone(), lines_text.len())
            })
            .collect();

        let total_chunks_len: usize = file_data.iter().map(|(_, len)| *len).sum();
        if total_chunks_len <= max_size {
            let result = file_data
                .iter()
                .map(|(t, _)| t.clone())
                .collect::<Vec<_>>()
                .join("");
            return result;
        }

        let cumulative_sizes: Vec<_> = file_data
            .iter()
            .scan(0usize, |acc, (_, len)| {
                *acc += len;
                Some(*acc)
            })
            .collect();

        let last_fitting_index = cumulative_sizes
            .iter()
            .position(|&size| size > max_size)
            .unwrap_or(file_data.len());

        if last_fitting_index == 0 {
            let truncated = truncate_lines_to_fit(&sorted_files[0].lines, max_size);
            let truncated_text: String = truncated.iter().map(|l| format!("{l}\n")).collect();
            (truncated_text, 1)
        } else {
            let included: Vec<_> = file_data
                .iter()
                .take(last_fitting_index)
                .map(|(t, _)| t.clone())
                .collect();
            (included.join(""), included.len())
        }
    };

    let was_truncated = result.len() < diff.len();

    if files_included < total_files || was_truncated {
        let summary = format!("\n[Truncated: {files_included} of {total_files} files shown]\n");
        if summary.len() <= max_size {
            if result.len() + summary.len() > max_size {
                let target_bytes = max_size.saturating_sub(summary.len());
                if target_bytes < result.len() {
                    let cut = result
                        .char_indices()
                        .take_while(|(idx, _)| *idx <= target_bytes)
                        .last()
                        .map(|(idx, _)| idx)
                        .unwrap_or(0);
                    return format!("{}{}", &result[..cut], summary);
                }
            }
            return format!("{result}{summary}");
        }
    }

    result
}

#[must_use]
pub fn truncate_diff_to_model_budget(diff: &str, max_size_bytes: u64) -> (String, bool) {
    let max_size = usize::try_from(max_size_bytes).unwrap_or(usize::MAX);
    if diff.len() <= max_size {
        (diff.to_string(), false)
    } else {
        (truncate_diff_if_large(diff, max_size), true)
    }
}

#[derive(Default)]
struct DiffFile {
    priority: i32,
    lines: Vec<String>,
}

fn prioritize_file_path(path: &str) -> i32 {
    let normalized = path.replace('\\', "/");
    let parts: Vec<&str> = normalized.split('/').filter(|p| !p.is_empty()).collect();

    if parts.contains(&"src") {
        100
    } else if parts.contains(&"tests") {
        50
    } else if std::path::Path::new(&normalized)
        .extension()
        .is_some_and(|ext| ext.eq_ignore_ascii_case("md") || ext.eq_ignore_ascii_case("txt"))
    {
        10
    } else {
        0
    }
}

fn truncate_to_utf8_boundary(s: &str, max_bytes: usize) -> String {
    if s.len() <= max_bytes {
        return s.to_string();
    }
    let cut = s
        .char_indices()
        .take_while(|(idx, _)| *idx <= max_bytes)
        .last()
        .map(|(idx, _)| idx)
        .unwrap_or(0);
    s[..cut].to_string()
}

pub fn truncate_lines_to_fit(lines: &[String], max_size: usize) -> Vec<String> {
    let suffix = " [truncated...]";
    let suffix_len = suffix.len();

    if lines.is_empty() {
        return Vec::new();
    }

    let line_sizes: Vec<usize> = lines.iter().map(|l| l.len() + 1).collect();
    let total_size: usize = line_sizes.iter().sum();

    if total_size <= max_size {
        return lines.to_vec();
    }

    let available_for_lines = max_size.saturating_sub(suffix_len);

    let result: Vec<_> = lines
        .iter()
        .scan(0usize, |size, line| {
            let line_size = line.len() + 1;
            if *size + line_size <= available_for_lines {
                *size += line_size;
                Some(line.clone())
            } else {
                None
            }
        })
        .collect();

    if result.is_empty() {
        return result;
    }

    let current_size: usize = result.iter().map(|l| l.len() + 1).sum();

    let adjusted: Vec<String> = if current_size + suffix_len > max_size {
        let target_bytes = max_size.saturating_sub(suffix_len);
        // Scan in reverse to collect lines that fit within target_bytes.
        // The scan state is (accumulated_so_far, still_accepting).
        // When a line doesn't fit and nothing has been accumulated yet,
        // we truncate that line to the UTF-8 boundary and stop.
        result
            .iter()
            .rev()
            .scan((0usize, true), |(accumulated, accepting), line| {
                if !*accepting {
                    return None;
                }
                let line_size = line.len() + 1;
                if *accumulated + line_size <= target_bytes {
                    *accumulated += line_size;
                    Some(Some(line.clone()))
                } else if *accumulated == 0 {
                    // First line already too big: truncate it.
                    *accepting = false;
                    let max_for_line = target_bytes.saturating_sub(1);
                    let new_line = truncate_to_utf8_boundary(line, max_for_line);
                    if new_line.is_empty() {
                        Some(None)
                    } else {
                        Some(Some(new_line))
                    }
                } else {
                    *accepting = false;
                    None
                }
            })
            .flatten()
            .collect::<Vec<_>>()
            .into_iter()
            .rev()
            .collect()
    } else {
        result
    };

    if adjusted.is_empty() {
        adjusted
    } else {
        let last_idx = adjusted.len() - 1;
        let (init, last) = adjusted.split_at(last_idx);
        let last_formatted = format!("{}{}", last[0], suffix);
        init.iter()
            .map(String::clone)
            .chain(std::iter::once(last_formatted))
            .collect()
    }
}

#[cfg(test)]
mod diff_truncation_tests {
    use super::*;

    #[test]
    fn prioritize_file_path_handles_crate_prefixed_paths() {
        assert_eq!(prioritize_file_path("ralph-workflow/src/lib.rs"), 100);
        assert_eq!(
            prioritize_file_path("ralph-workflow/tests/integration.rs"),
            50
        );
        assert_eq!(prioritize_file_path("README.md"), 10);
    }

    #[test]
    fn truncate_diff_to_model_budget_never_exceeds_max_size() {
        let files_included = 1;
        let total_files = 2;
        let summary = format!("\n[Truncated: {files_included} of {total_files} files shown]\n");

        let max_size = 1_000usize;

        let file1_header = "diff --git a/src/a.rs b/src/a.rs";
        let desired_file1_size = max_size - summary.len() + 1;
        let filler_line_len = desired_file1_size.saturating_sub(file1_header.len() + 2);
        let file1 = format!(
            "{file1_header}\n+{}\n",
            "x".repeat(filler_line_len.saturating_sub(1))
        );

        let file2 = "diff --git a/tests/b.rs b/tests/b.rs\n+small\n";
        let diff = format!("{file1}{file2}");

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(
            was_truncated,
            "expected truncation when diff exceeds max size"
        );
        assert!(
            truncated.len() <= max_size,
            "truncated diff must not exceed max_size (got {} > {})",
            truncated.len(),
            max_size
        );
    }

    #[test]
    fn truncate_lines_to_fit_reserves_space_for_truncation_suffix() {
        let max_size = 20usize;
        let lines = vec!["x".repeat(max_size - 1)];

        let truncated = truncate_lines_to_fit(&lines, max_size);

        let total_size: usize = truncated.iter().map(|l| l.len() + 1).sum();
        assert!(
            total_size <= max_size,
            "truncate_lines_to_fit must not exceed max_size after adding suffix (got {total_size} > {max_size})"
        );
    }

    #[test]
    fn truncate_diff_invariant_never_exceeds_max_size_edge_cases() {
        let summary_len = "\n[Truncated: 1 of 2 files shown]\n".len();

        [
            10,
            summary_len - 1,
            summary_len,
            summary_len + 1,
            summary_len + 10,
            100,
            1000,
        ]
        .into_iter()
        .for_each(|max_size| {
            let file1 = format!(
                "diff --git a/src/a.rs b/src/a.rs\n+{}\n",
                "x".repeat(max_size)
            );
            let file2 = "diff --git a/tests/b.rs b/tests/b.rs\n+extra\n";
            let diff = format!("{file1}{file2}");

            let (truncated, _) = truncate_diff_to_model_budget(&diff, max_size as u64);
            assert!(
                truncated.len() <= max_size,
                "truncated diff exceeded max_size {} (got {}): {:?}",
                max_size,
                truncated.len(),
                &truncated[..truncated.len().min(100)]
            );
        });
    }

    #[test]
    fn truncate_diff_boundary_content_sizes() {
        [50usize, 100, 200, 500].into_iter().for_each(|max_size| {
            let header = "diff --git a/a b/a\n+";
            let exact_diff = format!(
                "{}{}",
                header,
                "x".repeat(max_size.saturating_sub(header.len()))
            );
            if exact_diff.len() == max_size {
                let (result, was_truncated) =
                    truncate_diff_to_model_budget(&exact_diff, max_size as u64);
                assert!(!was_truncated, "exact size should not trigger truncation");
                assert_eq!(result.len(), max_size);
            }

            let over_diff = format!("{}{}", header, "x".repeat(max_size + 1 - header.len()));
            let (result, was_truncated) =
                truncate_diff_to_model_budget(&over_diff, max_size as u64);
            assert!(was_truncated, "over size should trigger truncation");
            assert!(
                result.len() <= max_size,
                "truncated result {} should not exceed max_size {}",
                result.len(),
                max_size
            );
        });
    }

    #[test]
    fn truncate_single_large_file_stays_within_budget() {
        let max_size = 100usize;

        let large_file = format!(
            "diff --git a/src/big.rs b/src/big.rs\n+{}\n",
            "x".repeat(max_size * 3)
        );

        let (truncated, was_truncated) =
            truncate_diff_to_model_budget(&large_file, max_size as u64);
        assert!(was_truncated, "large file should be truncated");
        assert!(
            truncated.len() <= max_size,
            "single large file truncation {} exceeded max_size {}",
            truncated.len(),
            max_size
        );
    }

    #[test]
    fn truncate_diff_handles_unicode_boundaries() {
        let max_size = 50usize;

        let emoji_line = "🎉".repeat(20);
        let diff = format!("diff --git a/a b/a\n+{emoji_line}\n");

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(was_truncated, "unicode diff should be truncated");
        assert!(
            truncated.len() <= max_size,
            "unicode truncation {} exceeded max_size {}",
            truncated.len(),
            max_size
        );
        assert!(
            std::str::from_utf8(truncated.as_bytes()).is_ok(),
            "truncated output should be valid UTF-8"
        );
    }

    #[test]
    fn truncate_diff_preserves_header_only_file() {
        let diff = "diff --git a/src/existing.rs b/src/existing.rs\n+line\n".to_string();
        let header_only = "diff --git a/src/header_only.rs b/src/header_only.rs\n";
        let combined = format!("{diff}{header_only}");

        let (result, was_truncated) = truncate_diff_to_model_budget(&combined, 512);
        assert!(!was_truncated, "diff should fit within budget");
        assert!(
            result.contains(header_only),
            "header-only file should still be present"
        );
    }

    #[test]
    fn truncate_empty_diff() {
        let (result, was_truncated) = truncate_diff_to_model_budget("", 100);
        assert!(!was_truncated, "empty diff should not be truncated");
        assert_eq!(result, "");
    }

    #[test]
    fn truncate_multiple_small_files_prefers_high_priority() {
        let max_size = 200usize;

        let src_file = "diff --git a/src/main.rs b/src/main.rs\n+high priority\n";
        let test_file = "diff --git a/tests/test.rs b/tests/test.rs\n+medium priority\n";
        let doc_file = "diff --git a/README.md b/README.md\n+low priority docs\n";
        let extra = "diff --git a/extra.rs b/extra.rs\n+extra content that exceeds budget\n";

        let diff = format!("{doc_file}{test_file}{src_file}{extra}");

        let (truncated, was_truncated) = truncate_diff_to_model_budget(&diff, max_size as u64);
        assert!(was_truncated, "should truncate when files exceed budget");
        assert!(
            truncated.len() <= max_size,
            "truncated {} exceeded max_size {}",
            truncated.len(),
            max_size
        );
        if truncated.contains("priority") {
            assert!(
                truncated.contains("high priority") || truncated.contains("medium priority"),
                "should prioritize src/tests over docs"
            );
        }
    }
}