infiniloom_engine/
content_processing.rs

1//! Content processing utilities for transforming file contents
2//!
3//! This module provides utilities for processing and transforming file content,
4//! particularly for optimizing content for LLM consumption by removing or
5//! truncating large binary/encoded data.
6//!
7//! # Features
8//!
9//! - **Base64 Detection and Truncation**: Automatically detects and truncates
10//!   base64-encoded content (data URIs, embedded images, etc.) to save tokens
11//! - **Pattern-based Processing**: Uses pre-compiled regex patterns for
12//!   efficient content transformation
13//!
14//! # Examples
15//!
16//! ## Truncating Base64 Content
17//!
18//! ```rust
19//! use infiniloom_engine::content_processing::truncate_base64;
20//!
21//! // Data URI with embedded image
//! let content = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA";
23//! let truncated = truncate_base64(content);
24//! assert!(truncated.contains("[BASE64_TRUNCATED]"));
25//!
26//! // Regular text is preserved
27//! let text = "This is normal text";
28//! let result = truncate_base64(text);
29//! assert_eq!(result, text);
30//! ```
31//!
32//! # Performance
33//!
34//! - Uses `once_cell::sync::Lazy` for one-time regex compilation
35//! - Regex patterns are compiled once and reused across all calls
36//! - Efficient for processing large codebases with many files
37//!
38//! # Detection Rules
39//!
40//! The base64 detection looks for:
41//! - **Data URIs**: `data:[mimetype];base64,[content]`
42//! - **Long base64 strings**: Sequences of 200+ base64 characters
43//!
44//! Truncation behavior:
45//! - Data URIs: Preserves prefix, replaces content with `[BASE64_TRUNCATED]`
//! - Long strings (200+ chars containing `+` or `/`): Shows first 50 chars + `...[BASE64_TRUNCATED]`
//! - Long strings without `+` or `/`: Not truncated (may not be base64)
//! - Short strings (<200 chars): Not truncated
48//! - Non-base64 text: Preserved unchanged
49
50use once_cell::sync::Lazy;
51use regex::Regex;
52
53/// Pre-compiled regex pattern for detecting base64 content
54///
55/// Matches:
56/// - `data:[mimetype];base64,[base64-content]` (data URIs)
57/// - Sequences of 200+ base64 characters (likely embedded data)
58///
59/// The pattern uses `[A-Za-z0-9+/]*={0,2}` to match valid base64 characters
60/// with optional padding (0-2 `=` characters at the end).
61static BASE64_PATTERN: Lazy<Regex> = Lazy::new(|| {
62    Regex::new(r"data:[^;]+;base64,[A-Za-z0-9+/]*={0,2}|[A-Za-z0-9+/]{200,}={0,2}").unwrap()
63});
64
65/// Truncate base64-encoded content in text to save tokens
66///
67/// This function detects and truncates large base64-encoded content (such as
68/// embedded images in data URIs or long base64 strings) to reduce token count
69/// while preserving the structure and meaning of the text.
70///
71/// # Arguments
72///
73/// * `content` - The text content to process
74///
75/// # Returns
76///
77/// A new string with base64 content truncated and replaced with markers.
78///
79/// # Detection and Truncation Rules
80///
81/// 1. **Data URIs** (e.g., `...`):
82///    - Preserves the MIME type prefix: `data:image/png;base64,`
83///    - Replaces the base64 content with: `[BASE64_TRUNCATED]`
84///    - Result: `data:image/png;base64,[BASE64_TRUNCATED]`
85///
86/// 2. **Long base64 strings** (200+ characters with `+` or `/`):
87///    - Shows first 50 characters
88///    - Appends: `...[BASE64_TRUNCATED]`
89///    - Result: `SGVsbG8gV29...ybGQ=...[BASE64_TRUNCATED]`
90///
91/// 3. **Short base64 strings** (<200 characters):
92///    - Not truncated (kept as-is)
93///
94/// 4. **Long strings without base64 characters** (no `+` or `/`):
95///    - Not truncated (likely not base64)
96///
97/// 5. **Regular text**:
98///    - Completely preserved
99///
100/// # Examples
101///
102/// ```rust,no_run
103/// use infiniloom_engine::content_processing::truncate_base64;
104///
105/// // Data URI truncation
106/// let data_uri = "...";
107/// let result = truncate_base64(data_uri);
108/// assert_eq!(result, "data:image/png;base64,[BASE64_TRUNCATED]");
109///
110/// // Long base64 string truncation
111/// let long_base64 = "A".repeat(250) + "+/";
112/// let result = truncate_base64(&long_base64);
113/// assert!(result.contains("[BASE64_TRUNCATED]"));
114///
115/// // Short base64 preserved
116/// let short = "SGVsbG8gV29ybGQ="; // "Hello World" in base64 (16 chars)
117/// let result = truncate_base64(short);
118/// assert_eq!(result, short); // Unchanged
119///
120/// // Regular text preserved
121/// let text = "This is regular code with no base64";
122/// let result = truncate_base64(text);
123/// assert_eq!(result, text); // Unchanged
124/// ```
125///
126/// # Performance
127///
128/// - Uses pre-compiled regex pattern (compiled once, reused forever)
129/// - Efficient replacement with `Regex::replace_all()`
130/// - Only allocates new string if matches are found
131///
132/// # Use Cases
133///
134/// - Reducing token count when packing repositories with embedded images
135/// - Removing large data URIs from HTML/CSS files
136/// - Truncating base64-encoded assets in configuration files
137/// - Optimizing content for LLM context windows
138pub fn truncate_base64(content: &str) -> String {
139    BASE64_PATTERN
140        .replace_all(content, |caps: &regex::Captures<'_>| {
141            let matched = caps.get(0).map_or("", |m| m.as_str());
142
143            // Handle data URIs (preserve MIME type prefix)
144            if matched.starts_with("data:") {
145                if let Some(comma_idx) = matched.find(',') {
146                    let prefix = &matched[..comma_idx + 1];
147                    format!("{}[BASE64_TRUNCATED]", prefix)
148                } else {
149                    "[BASE64_TRUNCATED]".to_owned()
150                }
151            }
152            // Handle long base64 strings (200+ chars)
153            else if matched.len() > 100 {
154                // Only truncate if it looks like base64 (has + or /)
155                if matched.contains('+') || matched.contains('/') {
156                    format!("{}...[BASE64_TRUNCATED]", &matched[..50])
157                } else {
158                    // Might not be base64, keep as-is
159                    matched.to_owned()
160                }
161            }
162            // Short strings: keep as-is
163            else {
164                matched.to_owned()
165            }
166        })
167        .to_string()
168}
169
#[cfg(test)]
mod tests {
    use super::*;

    // ============================================
    // truncate_base64 Tests
    // ============================================

    /// A lone data URI keeps its prefix and loses its payload.
    #[test]
    fn test_truncate_base64_data_uri() {
        let input = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA";
        let result = truncate_base64(input);
        assert_eq!(result, "data:image/png;base64,[BASE64_TRUNCATED]");
    }

    /// A 252-character run containing `+` and `/` is cut to a 50-char preview.
    #[test]
    fn test_truncate_base64_long_string() {
        let input = "A".repeat(150) + "+" + &"B".repeat(100) + "/";
        let result = truncate_base64(&input);
        assert_eq!(
            result,
            format!("{}...[BASE64_TRUNCATED]", "A".repeat(50))
        );
    }

    #[test]
    fn test_truncate_base64_no_truncation_short() {
        let input = "SGVsbG8gV29ybGQ="; // "Hello World" in base64
        let result = truncate_base64(input);
        // Short strings are not truncated
        assert_eq!(result, input);
    }

    #[test]
    fn test_truncate_base64_preserves_non_base64() {
        let input = "This is regular text without base64";
        let result = truncate_base64(input);
        assert_eq!(result, input);
    }

    /// Every data URI in the text is handled independently.
    #[test]
    fn test_truncate_base64_multiple_data_uris() {
        let input =
            "data:image/png;base64,iVBORw0KGgo= and data:image/jpeg;base64,/9j/4AAQSkZJRg==";
        let result = truncate_base64(input);
        assert_eq!(
            result,
            "data:image/png;base64,[BASE64_TRUNCATED] and data:image/jpeg;base64,[BASE64_TRUNCATED]"
        );
    }

    /// Text surrounding a data URI is left untouched.
    #[test]
    fn test_truncate_base64_mixed_content() {
        let input = "Normal text data:image/png;base64,iVBORw0KGgo= more text";
        let result = truncate_base64(input);
        assert_eq!(
            result,
            "Normal text data:image/png;base64,[BASE64_TRUNCATED] more text"
        );
    }

    #[test]
    fn test_truncate_base64_empty_string() {
        let input = "";
        let result = truncate_base64(input);
        assert_eq!(result, "");
    }

    #[test]
    fn test_truncate_base64_data_uri_without_comma() {
        // Malformed data URI without comma: the pattern requires `;base64,`,
        // so nothing matches and the text is returned unchanged.
        let input = "data:image/png;base64";
        let result = truncate_base64(input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_truncate_base64_long_without_special_chars() {
        // Long string without + or / (likely not base64)
        let input = "A".repeat(250);
        let result = truncate_base64(&input);
        // Should NOT be truncated (no base64 indicators)
        assert_eq!(result, input);
    }

    #[test]
    fn test_truncate_base64_exactly_200_chars() {
        // Edge case: exactly 200 characters with base64 chars
        let input = "A".repeat(199) + "+";
        let result = truncate_base64(&input);
        // Should be detected (200+ chars with +)
        assert_eq!(
            result,
            format!("{}...[BASE64_TRUNCATED]", "A".repeat(50))
        );
    }

    #[test]
    fn test_truncate_base64_just_below_threshold() {
        // Edge case: 199 characters is one short of the 200-character minimum
        let input = "A".repeat(198) + "+";
        let result = truncate_base64(&input);
        assert_eq!(result, input);
    }
}