Skip to main content

ironflow_core/
utils.rs

1//! Utility functions for output handling and OOM protection.
2//!
3//! Shell commands and agent processes can produce arbitrarily large output.
4//! This module provides [`truncate_output`] to cap captured output at
5//! [`MAX_OUTPUT_SIZE`] bytes, preventing out-of-memory conditions in
6//! long-running workflows.
7
8use std::str;
9
10use tracing::warn;
11
/// Average number of characters per token used for estimation.
///
/// This is a rough heuristic (1 token ~ 4 chars for English text).
/// The actual ratio varies by language and content type.
pub const CHARS_PER_TOKEN: usize = 4;

/// Estimate the number of tokens from a character count.
///
/// Computes `char_count / CHARS_PER_TOKEN` with integer division, so any
/// remainder is discarded and the result rounds down. Because the ratio is
/// only a heuristic, the estimate can be too high or too low depending on the
/// language and content type of the text being measured.
///
/// The function is `const`, so it can also be used to derive token budgets in
/// `const` and `static` items.
///
/// # Examples
///
/// ```
/// use ironflow_core::utils::estimate_tokens;
///
/// assert_eq!(estimate_tokens(400), 100);
/// assert_eq!(estimate_tokens(0), 0);
/// assert_eq!(estimate_tokens(3), 0); // rounds down
///
/// // Usable in constant contexts.
/// const BUDGET: usize = estimate_tokens(800_000);
/// assert_eq!(BUDGET, 200_000);
/// ```
#[must_use]
pub const fn estimate_tokens(char_count: usize) -> usize {
    // Integer division truncates: a partial token is never counted.
    char_count / CHARS_PER_TOKEN
}
35
36/// Maximum number of bytes kept from a single process output stream (10 MB).
37///
38/// Any output beyond this limit is silently dropped after a warning is logged.
39pub const MAX_OUTPUT_SIZE: usize = 10 * 1024 * 1024; // 10 MB
40
41/// Truncate raw process output to at most [`MAX_OUTPUT_SIZE`] bytes and
42/// convert it to a UTF-8 [`String`].
43///
44/// If the data exceeds the limit a warning is emitted via [`tracing`] with
45/// the provided `context` label. Invalid UTF-8 sequences are replaced with
46/// the Unicode replacement character (U+FFFD). Trailing whitespace is trimmed.
47///
48/// # Examples
49///
50/// ```no_run
51/// use ironflow_core::utils::truncate_output;
52///
53/// let data = b"hello world\n";
54/// let output = truncate_output(data, "my-step");
55/// assert_eq!(output, "hello world");
56/// ```
57pub fn truncate_output(data: &[u8], context: &str) -> String {
58    if data.len() > MAX_OUTPUT_SIZE {
59        warn!(
60            bytes = data.len(),
61            max = MAX_OUTPUT_SIZE,
62            "{context} output truncated to limit"
63        );
64    }
65    let truncated = &data[..data.len().min(MAX_OUTPUT_SIZE)];
66    match str::from_utf8(truncated) {
67        Ok(s) => s.trim_end().to_string(),
68        Err(_) => {
69            let cow = String::from_utf8_lossy(truncated);
70            cow.trim_end().to_string()
71        }
72    }
73}
74
/// Unit tests for [`estimate_tokens`] and [`truncate_output`].
#[cfg(test)]
mod tests {
    use super::*;

    // ---- estimate_tokens -------------------------------------------------

    #[test]
    fn estimate_tokens_standard() {
        assert_eq!(estimate_tokens(400), 100);
        assert_eq!(estimate_tokens(800_000), 200_000);
    }

    #[test]
    fn estimate_tokens_zero() {
        assert_eq!(estimate_tokens(0), 0);
    }

    #[test]
    fn estimate_tokens_rounds_down() {
        assert_eq!(estimate_tokens(3), 0);
        assert_eq!(estimate_tokens(5), 1);
        assert_eq!(estimate_tokens(7), 1);
    }

    // ---- truncate_output: size handling ----------------------------------

    #[test]
    fn small_input_returned_as_is() {
        let data = b"hello world";
        let result = truncate_output(data, "test");
        assert_eq!(result, "hello world");
    }

    #[test]
    fn input_exactly_at_max_output_size_returned_as_is() {
        let data = vec![b'a'; MAX_OUTPUT_SIZE];
        let result = truncate_output(&data, "test");
        assert_eq!(result.len(), MAX_OUTPUT_SIZE);
        assert!(result.bytes().all(|b| b == b'a'));
    }

    #[test]
    fn input_over_limit_gets_truncated() {
        let data = vec![b'x'; MAX_OUTPUT_SIZE + 100];
        let result = truncate_output(&data, "test");
        assert_eq!(result.len(), MAX_OUTPUT_SIZE);
        // The kept part must be the untouched prefix of the input.
        assert!(result.bytes().all(|b| b == b'x'));
    }

    #[test]
    fn way_over_limit_gets_truncated() {
        let data = vec![b'z'; 20 * 1024 * 1024];
        let result = truncate_output(&data, "test");
        assert_eq!(result.len(), MAX_OUTPUT_SIZE);
        assert!(result.bytes().all(|b| b == b'z'));
    }

    #[test]
    fn empty_input_returns_empty_string() {
        let result = truncate_output(b"", "test");
        assert_eq!(result, "");
    }

    // ---- truncate_output: whitespace handling ----------------------------

    #[test]
    fn trailing_whitespace_trimmed() {
        let data = b"hello   \n\n  ";
        let result = truncate_output(data, "test");
        assert_eq!(result, "hello");
    }

    #[test]
    fn only_whitespace_returns_empty() {
        let data = b"   \n\t  \r\n  ";
        let result = truncate_output(data, "test");
        assert_eq!(result, "");
    }

    #[test]
    fn leading_whitespace_preserved() {
        // Only the end of the output is trimmed.
        let result = truncate_output(b"  hello\n", "test");
        assert_eq!(result, "  hello");
    }

    #[test]
    fn whitespace_exposed_by_truncation_is_trimmed() {
        // The cut lands right after two spaces, so trimming runs on the
        // truncated data and the result ends up shorter than the limit.
        let mut data = vec![b'a'; MAX_OUTPUT_SIZE - 2];
        data.extend_from_slice(b"  bbb");
        let result = truncate_output(&data, "test");
        assert_eq!(result.len(), MAX_OUTPUT_SIZE - 2);
        assert!(result.bytes().all(|b| b == b'a'));
    }

    // ---- truncate_output: encoding handling ------------------------------

    #[test]
    fn invalid_utf8_uses_lossy_conversion() {
        let data: &[u8] = &[0xFF, 0xFE, b'h', b'i'];
        let result = truncate_output(data, "test");
        // Each invalid byte becomes one replacement character.
        assert_eq!(result, "\u{FFFD}\u{FFFD}hi");
    }

    #[test]
    fn valid_multibyte_utf8_emoji_handled() {
        let data = "\u{1F680} rocket".as_bytes();
        let result = truncate_output(data, "test");
        assert_eq!(result, "\u{1F680} rocket");
    }

    #[test]
    fn truncation_splitting_multibyte_char_handled_via_lossy() {
        // Fill all but the last byte of the limit with ASCII.
        let mut data = vec![b'a'; MAX_OUTPUT_SIZE - 1];
        // Append the 4-byte rocket emoji (U+1F680 = F0 9F 9A 80). Its lead
        // byte is the last byte inside the limit; the three continuation
        // bytes fall past the limit and are cut off.
        data.extend_from_slice(&[0xF0, 0x9F, 0x9A, 0x80]);
        assert_eq!(data.len(), MAX_OUTPUT_SIZE + 3);

        let result = truncate_output(&data, "test");

        // The orphaned lead byte is replaced with a single U+FFFD.
        assert!(result.ends_with('\u{FFFD}'));
        // U+FFFD is 3 bytes in UTF-8, replacing 1 input byte: limit - 1 + 3.
        assert_eq!(result.len(), MAX_OUTPUT_SIZE + 2);
    }

    #[test]
    fn null_bytes_in_data() {
        let data: &[u8] = &[b'a', 0x00, b'b', 0x00, b'c'];
        let result = truncate_output(data, "test");
        assert_eq!(result, "a\0b\0c");
    }
}