infiniloom_engine/content_processing.rs
1//! Content processing utilities for transforming file contents
2//!
3//! This module provides utilities for processing and transforming file content,
4//! particularly for optimizing content for LLM consumption by removing or
5//! truncating large binary/encoded data.
6//!
7//! # Features
8//!
9//! - **Base64 Detection and Truncation**: Automatically detects and truncates
10//! base64-encoded content (data URIs, embedded images, etc.) to save tokens
11//! - **Pattern-based Processing**: Uses pre-compiled regex patterns for
12//! efficient content transformation
13//!
14//! # Examples
15//!
16//! ## Truncating Base64 Content
17//!
18//! ```rust
19//! use infiniloom_engine::content_processing::truncate_base64;
20//!
21//! // Data URI with embedded image
//! let content = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA";
23//! let truncated = truncate_base64(content);
24//! assert!(truncated.contains("[BASE64_TRUNCATED]"));
25//!
26//! // Regular text is preserved
27//! let text = "This is normal text";
28//! let result = truncate_base64(text);
29//! assert_eq!(result, text);
30//! ```
31//!
32//! # Performance
33//!
34//! - Uses `once_cell::sync::Lazy` for one-time regex compilation
35//! - Regex patterns are compiled once and reused across all calls
36//! - Efficient for processing large codebases with many files
37//!
38//! # Detection Rules
39//!
40//! The base64 detection looks for:
41//! - **Data URIs**: `data:[mimetype];base64,[content]`
42//! - **Long base64 strings**: Sequences of 200+ base64 characters
43//!
44//! Truncation behavior:
45//! - Data URIs: Preserves prefix, replaces content with `[BASE64_TRUNCATED]`
//! - Long strings (200+ chars containing `+` or `/`): Shows first 50 chars + `...[BASE64_TRUNCATED]`
47//! - Short strings (<200 chars): Not truncated
48//! - Non-base64 text: Preserved unchanged
49
50use once_cell::sync::Lazy;
51use regex::Regex;
52
53/// Pre-compiled regex pattern for detecting base64 content
54///
55/// Matches:
56/// - `data:[mimetype];base64,[base64-content]` (data URIs)
57/// - Sequences of 200+ base64 characters (likely embedded data)
58///
59/// The pattern uses `[A-Za-z0-9+/]*={0,2}` to match valid base64 characters
60/// with optional padding (0-2 `=` characters at the end).
61static BASE64_PATTERN: Lazy<Regex> = Lazy::new(|| {
62 Regex::new(r"data:[^;]+;base64,[A-Za-z0-9+/]*={0,2}|[A-Za-z0-9+/]{200,}={0,2}").unwrap()
63});
64
65/// Truncate base64-encoded content in text to save tokens
66///
67/// This function detects and truncates large base64-encoded content (such as
68/// embedded images in data URIs or long base64 strings) to reduce token count
69/// while preserving the structure and meaning of the text.
70///
71/// # Arguments
72///
73/// * `content` - The text content to process
74///
75/// # Returns
76///
77/// A new string with base64 content truncated and replaced with markers.
78///
79/// # Detection and Truncation Rules
80///
81/// 1. **Data URIs** (e.g., `...`):
82/// - Preserves the MIME type prefix: `data:image/png;base64,`
83/// - Replaces the base64 content with: `[BASE64_TRUNCATED]`
84/// - Result: `data:image/png;base64,[BASE64_TRUNCATED]`
85///
86/// 2. **Long base64 strings** (200+ characters with `+` or `/`):
87/// - Shows first 50 characters
88/// - Appends: `...[BASE64_TRUNCATED]`
89/// - Result: `SGVsbG8gV29...ybGQ=...[BASE64_TRUNCATED]`
90///
91/// 3. **Short base64 strings** (<200 characters):
92/// - Not truncated (kept as-is)
93///
94/// 4. **Long strings without base64 characters** (no `+` or `/`):
95/// - Not truncated (likely not base64)
96///
97/// 5. **Regular text**:
98/// - Completely preserved
99///
100/// # Examples
101///
102/// ```rust
103/// use infiniloom_engine::content_processing::truncate_base64;
104///
105/// // Data URI truncation
106/// let data_uri = "...";
107/// let result = truncate_base64(data_uri);
108/// assert_eq!(result, "data:image/png;base64,[BASE64_TRUNCATED]");
109///
110/// // Long base64 string truncation
111/// let long_base64 = "A".repeat(250) + "+/";
112/// let result = truncate_base64(&long_base64);
113/// assert!(result.contains("[BASE64_TRUNCATED]"));
114///
115/// // Short base64 preserved
116/// let short = "SGVsbG8gV29ybGQ="; // "Hello World" in base64 (16 chars)
117/// let result = truncate_base64(short);
118/// assert_eq!(result, short); // Unchanged
119///
120/// // Regular text preserved
121/// let text = "This is regular code with no base64";
122/// let result = truncate_base64(text);
123/// assert_eq!(result, text); // Unchanged
124/// ```
125///
126/// # Performance
127///
128/// - Uses pre-compiled regex pattern (compiled once, reused forever)
129/// - Efficient replacement with `Regex::replace_all()`
130/// - Only allocates new string if matches are found
131///
132/// # Use Cases
133///
134/// - Reducing token count when packing repositories with embedded images
135/// - Removing large data URIs from HTML/CSS files
136/// - Truncating base64-encoded assets in configuration files
137/// - Optimizing content for LLM context windows
138pub fn truncate_base64(content: &str) -> String {
139 BASE64_PATTERN
140 .replace_all(content, |caps: ®ex::Captures<'_>| {
141 let matched = caps.get(0).map_or("", |m| m.as_str());
142
143 // Handle data URIs (preserve MIME type prefix)
144 if matched.starts_with("data:") {
145 if let Some(comma_idx) = matched.find(',') {
146 let prefix = &matched[..comma_idx + 1];
147 format!("{}[BASE64_TRUNCATED]", prefix)
148 } else {
149 "[BASE64_TRUNCATED]".to_owned()
150 }
151 }
152 // Handle long base64 strings (200+ chars)
153 else if matched.len() > 100 {
154 // Only truncate if it looks like base64 (has + or /)
155 if matched.contains('+') || matched.contains('/') {
156 format!("{}...[BASE64_TRUNCATED]", &matched[..50])
157 } else {
158 // Might not be base64, keep as-is
159 matched.to_owned()
160 }
161 }
162 // Short strings: keep as-is
163 else {
164 matched.to_owned()
165 }
166 })
167 .to_string()
168}
169
#[cfg(test)]
mod tests {
    use super::*;

    // ============================================
    // truncate_base64 Tests
    // ============================================

    /// Expected output for a bare base64 run: 50-char preview plus marker.
    fn truncated_run(preview: &str) -> String {
        format!("{}...[BASE64_TRUNCATED]", preview)
    }

    #[test]
    fn test_truncate_base64_data_uri() {
        let input = "![image](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA)";
        let result = truncate_base64(input);
        assert!(result.contains("data:image/png;base64,"));
        assert!(result.contains("[BASE64_TRUNCATED]"));
        // The payload itself must be gone.
        assert!(!result.contains("iVBORw0KGgo"));
    }

    #[test]
    fn test_truncate_base64_long_string() {
        // Long base64 string with + and / characters (252 chars in total)
        let input = "A".repeat(150) + "+" + &"B".repeat(100) + "/";
        let result = truncate_base64(&input);
        // Exactly the first 50 characters survive, followed by the marker.
        assert_eq!(result, truncated_run(&"A".repeat(50)));
    }

    #[test]
    fn test_truncate_base64_no_truncation_short() {
        let input = "SGVsbG8gV29ybGQ="; // "Hello World" in base64
        let result = truncate_base64(input);
        // Short strings are not truncated
        assert_eq!(result, input);
    }

    #[test]
    fn test_truncate_base64_preserves_non_base64() {
        let input = "This is regular text without base64";
        let result = truncate_base64(input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_truncate_base64_multiple_data_uris() {
        let input = "![a](data:image/png;base64,AAAABBBB) and ![b](data:image/jpeg;base64,CCCCDDDD)";
        let result = truncate_base64(input);
        assert!(result.contains("data:image/png;base64,[BASE64_TRUNCATED]"));
        assert!(result.contains("data:image/jpeg;base64,[BASE64_TRUNCATED]"));
    }

    #[test]
    fn test_truncate_base64_mixed_content() {
        let input = "Normal text ![img](data:image/png;base64,iVBORw0KGgo=) more text";
        let result = truncate_base64(input);
        assert!(result.contains("Normal text"));
        assert!(result.contains("[BASE64_TRUNCATED]"));
        assert!(result.contains("more text"));
    }

    #[test]
    fn test_truncate_base64_empty_string() {
        let input = "";
        let result = truncate_base64(input);
        assert_eq!(result, "");
    }

    #[test]
    fn test_truncate_base64_data_uri_without_comma() {
        // Malformed data URI without comma
        let input = "data:image/png;base64";
        let result = truncate_base64(input);
        // Should handle gracefully
        assert_eq!(result, input);
    }

    #[test]
    fn test_truncate_base64_long_without_special_chars() {
        // Long string without + or / (likely not base64)
        let input = "A".repeat(250);
        let result = truncate_base64(&input);
        // Should NOT be truncated (no base64 indicators)
        assert_eq!(result, input);
    }

    #[test]
    fn test_truncate_base64_exactly_200_chars() {
        // Edge case: exactly 200 characters with base64 chars
        let input = "A".repeat(199) + "+";
        let result = truncate_base64(&input);
        // Should be detected (200+ chars with +)
        assert!(result.contains("[BASE64_TRUNCATED]"));
    }

    #[test]
    fn test_truncate_base64_just_below_200_chars() {
        // Edge case: 199 characters, one short of the detection threshold
        let input = "A".repeat(198) + "+";
        let result = truncate_base64(&input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_truncate_base64_long_string_drops_padding() {
        // Trailing `=` padding belongs to the match and is removed with it
        let input = "A".repeat(199) + "+==";
        let result = truncate_base64(&input);
        assert_eq!(result, truncated_run(&"A".repeat(50)));
    }

    #[test]
    fn test_truncate_base64_long_string_keeps_surrounding_text() {
        let run = "B".repeat(220) + "/";
        let input = format!("key = \"{}\";", run);
        let result = truncate_base64(&input);
        assert_eq!(result, format!("key = \"{}\";", truncated_run(&"B".repeat(50))));
    }
}