1use std::str;
9
10use tracing::warn;
11
/// Number of characters assumed to make up one token in [`estimate_tokens`].
pub const CHARS_PER_TOKEN: usize = 4;

/// Estimates how many tokens a text of `char_count` characters occupies.
///
/// The estimate is `char_count / CHARS_PER_TOKEN` using integer division, so it
/// rounds down: any count below [`CHARS_PER_TOKEN`] yields `0`.
///
/// Declared `const` so the estimate can also be evaluated in constant contexts,
/// for example when deriving another `const` limit from a character budget.
pub const fn estimate_tokens(char_count: usize) -> usize {
    char_count / CHARS_PER_TOKEN
}
35
36pub const MAX_OUTPUT_SIZE: usize = 10 * 1024 * 1024; pub fn truncate_output(data: &[u8], context: &str) -> String {
58 if data.len() > MAX_OUTPUT_SIZE {
59 warn!(
60 bytes = data.len(),
61 max = MAX_OUTPUT_SIZE,
62 "{context} output truncated to limit"
63 );
64 }
65 let truncated = &data[..data.len().min(MAX_OUTPUT_SIZE)];
66 match str::from_utf8(truncated) {
67 Ok(s) => s.trim_end().to_string(),
68 Err(_) => {
69 let cow = String::from_utf8_lossy(truncated);
70 cow.trim_end().to_string()
71 }
72 }
73}
74
#[cfg(test)]
mod tests {
    use super::*;

    // ---- estimate_tokens ----

    /// Character counts that are exact multiples of the divisor.
    #[test]
    fn estimate_tokens_standard() {
        for (chars, tokens) in [(400, 100), (800_000, 200_000)] {
            assert_eq!(estimate_tokens(chars), tokens);
        }
    }

    /// Zero characters means zero tokens.
    #[test]
    fn estimate_tokens_zero() {
        let tokens = estimate_tokens(0);
        assert_eq!(tokens, 0);
    }

    /// Remainders are discarded rather than rounded up.
    #[test]
    fn estimate_tokens_rounds_down() {
        for (chars, tokens) in [(3, 0), (5, 1), (7, 1)] {
            assert_eq!(estimate_tokens(chars), tokens);
        }
    }

    // ---- truncate_output: size handling ----

    /// Short, clean input passes through unchanged.
    #[test]
    fn small_input_returned_as_is() {
        assert_eq!(truncate_output(b"hello world", "test"), "hello world");
    }

    /// Input of exactly the limit is not cut.
    #[test]
    fn input_exactly_at_max_output_size_returned_as_is() {
        let input = vec![b'a'; MAX_OUTPUT_SIZE];
        let output = truncate_output(&input, "test");
        assert_eq!(output.len(), MAX_OUTPUT_SIZE);
        assert!(output.bytes().all(|b| b == b'a'));
    }

    /// Input slightly above the limit is cut down to the limit.
    #[test]
    fn input_over_limit_gets_truncated() {
        let input = vec![b'x'; MAX_OUTPUT_SIZE + 100];
        assert_eq!(truncate_output(&input, "test").len(), MAX_OUTPUT_SIZE);
    }

    /// Input of twice the limit is cut down to the limit.
    #[test]
    fn way_over_limit_gets_truncated() {
        let input = vec![b'z'; 20 * 1024 * 1024];
        assert_eq!(truncate_output(&input, "test").len(), MAX_OUTPUT_SIZE);
    }

    /// Empty input produces an empty string.
    #[test]
    fn empty_input_returns_empty_string() {
        assert_eq!(truncate_output(b"", "test"), "");
    }

    // ---- truncate_output: whitespace ----

    /// Trailing spaces and newlines are removed.
    #[test]
    fn trailing_whitespace_trimmed() {
        assert_eq!(truncate_output(b"hello \n\n ", "test"), "hello");
    }

    /// Input made only of whitespace collapses to an empty string.
    #[test]
    fn only_whitespace_returns_empty() {
        assert_eq!(truncate_output(b" \n\t \r\n ", "test"), "");
    }

    // ---- truncate_output: encoding ----

    /// Invalid bytes turn into U+FFFD while the valid remainder survives.
    #[test]
    fn invalid_utf8_uses_lossy_conversion() {
        let output = truncate_output(&[0xFF, 0xFE, b'h', b'i'], "test");
        assert!(output.contains('\u{FFFD}'));
        assert!(output.contains("hi"));
    }

    /// Well-formed multi-byte characters round-trip unchanged.
    #[test]
    fn valid_multibyte_utf8_emoji_handled() {
        let text = "\u{1F680} rocket";
        assert_eq!(truncate_output(text.as_bytes(), "test"), text);
    }

    /// A four-byte character straddling the limit is replaced, not dropped silently.
    #[test]
    fn truncation_splitting_multibyte_char_handled_via_lossy() {
        let mut input = vec![b'a'; MAX_OUTPUT_SIZE - 1];
        // UTF-8 encoding of U+1F680; only its first byte fits under the limit.
        input.extend_from_slice(&[0xF0, 0x9F, 0x9A, 0x80]);
        let output = truncate_output(&input, "test");
        assert!(output.contains('\u{FFFD}'));
    }

    /// NUL bytes are valid UTF-8 and are preserved.
    #[test]
    fn null_bytes_in_data() {
        assert_eq!(truncate_output(b"a\0b\0c", "test"), "a\0b\0c");
    }
}