// codetether_agent/rlm/chunker.rs
//! Semantic chunking for large contexts
//!
//! Splits content intelligently at natural boundaries and prioritizes
//! chunks for token budget selection.

use serde::{Deserialize, Serialize};
7
/// Content type for optimized processing.
///
/// Produced by [`RlmChunker::detect_content_type`] from line-level heuristics.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ContentType {
    /// Predominantly source code (definition keywords, braces, comments).
    Code,
    /// Predominantly documentation/prose (markdown markers, long text lines).
    Documents,
    /// Predominantly log output (timestamps, log-level keywords).
    Logs,
    /// Predominantly chat transcript (`[User]:` / `[Assistant]:` markers).
    Conversation,
    /// No single category dominates the sampled lines.
    Mixed,
}
18
/// A chunk of content with metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// The chunk's text (original lines joined with '\n').
    pub content: String,
    /// Kind of content this chunk holds; serialized as `"type"`.
    #[serde(rename = "type")]
    pub chunk_type: ChunkType,
    /// 0-based index of the first source line in this chunk.
    pub start_line: usize,
    /// 0-based index of the last source line in this chunk (inclusive).
    pub end_line: usize,
    /// Estimated token count of `content` (see `RlmChunker::estimate_tokens`).
    pub tokens: usize,
    /// Higher = more important to keep during budget selection.
    pub priority: u8,
}
31
/// Kind of content a [`Chunk`] holds, assigned at chunk boundaries.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ChunkType {
    /// Source code (definitions, fenced code blocks, file-path markers).
    Code,
    /// Plain text / prose; also the default when no boundary matched.
    Text,
    /// Output of a tool invocation (`[Tool ` markers).
    ToolOutput,
    /// Conversation turns (`[User]:` / `[Assistant]:` markers).
    Conversation,
}
40
/// Options for chunking.
#[derive(Debug, Clone)]
pub struct ChunkOptions {
    /// Maximum estimated tokens per chunk; larger chunks get split further.
    pub max_chunk_tokens: usize,
    /// Number of trailing lines whose chunks get a priority boost
    /// (raised to at least 8) so recent content survives selection.
    pub preserve_recent: usize,
}
49
impl Default for ChunkOptions {
    /// Defaults: chunks capped at 4000 estimated tokens; the last 100
    /// lines of content are treated as "recent" and priority-boosted.
    fn default() -> Self {
        Self {
            max_chunk_tokens: 4000,
            preserve_recent: 100,
        }
    }
}
58
/// Semantic chunker for large contexts.
///
/// Stateless: all functionality is exposed as associated functions.
pub struct RlmChunker;
61
62impl RlmChunker {
63    /// Detect the primary type of content for optimized processing
64    pub fn detect_content_type(content: &str) -> ContentType {
65        let lines: Vec<&str> = content.lines().collect();
66        let sample_size = lines.len().min(200);
67
68        // Sample from head and tail
69        let sample: Vec<&str> = lines
70            .iter()
71            .take(sample_size / 2)
72            .chain(lines.iter().rev().take(sample_size / 2))
73            .copied()
74            .collect();
75
76        let mut code_indicators = 0;
77        let mut log_indicators = 0;
78        let mut conversation_indicators = 0;
79        let mut document_indicators = 0;
80
81        for line in &sample {
82            let trimmed = line.trim();
83
84            // Code indicators
85            if Self::is_code_line(trimmed) {
86                code_indicators += 1;
87            }
88
89            // Log indicators
90            if Self::is_log_line(trimmed) {
91                log_indicators += 1;
92            }
93
94            // Conversation indicators
95            if Self::is_conversation_line(trimmed) {
96                conversation_indicators += 1;
97            }
98
99            // Document indicators
100            if Self::is_document_line(trimmed) {
101                document_indicators += 1;
102            }
103        }
104
105        let total =
106            code_indicators + log_indicators + conversation_indicators + document_indicators;
107        if total == 0 {
108            return ContentType::Mixed;
109        }
110
111        let threshold = (total as f64 * 0.3) as usize;
112
113        if conversation_indicators > threshold {
114            ContentType::Conversation
115        } else if log_indicators > threshold {
116            ContentType::Logs
117        } else if code_indicators > threshold {
118            ContentType::Code
119        } else if document_indicators > threshold {
120            ContentType::Documents
121        } else {
122            ContentType::Mixed
123        }
124    }
125
126    fn is_code_line(line: &str) -> bool {
127        // Function/class/import definitions
128        let patterns = [
129            "function", "class ", "def ", "const ", "let ", "var ", "import ", "export ", "async ",
130            "fn ", "impl ", "struct ", "enum ", "pub ", "use ", "mod ", "trait ",
131        ];
132
133        if patterns.iter().any(|p| line.starts_with(p)) {
134            return true;
135        }
136
137        // Brace-only or semicolon-only lines
138        if matches!(line, "{" | "}" | "(" | ")" | ";" | "{}" | "};") {
139            return true;
140        }
141
142        // Comment lines
143        if line.starts_with("//")
144            || line.starts_with("#")
145            || line.starts_with("*")
146            || line.starts_with("/*")
147        {
148            return true;
149        }
150
151        false
152    }
153
154    fn is_log_line(line: &str) -> bool {
155        // ISO date prefix
156        if line.len() >= 10
157            && line.chars().take(4).all(|c| c.is_ascii_digit())
158            && line.chars().nth(4) == Some('-')
159        {
160            return true;
161        }
162
163        // Time prefix [HH:MM
164        if line.starts_with('[')
165            && line.len() > 5
166            && line.chars().nth(1).is_some_and(|c| c.is_ascii_digit())
167        {
168            return true;
169        }
170
171        // Log level prefixes
172        let log_levels = ["INFO", "DEBUG", "WARN", "ERROR", "FATAL", "TRACE"];
173        for level in log_levels {
174            if line.starts_with(level) || line.contains(&format!(" {} ", level)) {
175                return true;
176            }
177        }
178
179        false
180    }
181
182    fn is_conversation_line(line: &str) -> bool {
183        let patterns = [
184            "[User]:",
185            "[Assistant]:",
186            "[Human]:",
187            "[AI]:",
188            "User:",
189            "Assistant:",
190            "Human:",
191            "AI:",
192            "[Tool ",
193            "<user>",
194            "<assistant>",
195            "<system>",
196        ];
197        patterns.iter().any(|p| line.starts_with(p))
198    }
199
200    fn is_document_line(line: &str) -> bool {
201        // Markdown headers
202        if line.starts_with('#') && line.chars().nth(1).is_some_and(|c| c == ' ' || c == '#') {
203            return true;
204        }
205
206        // Bold text
207        if line.starts_with("**") && line.contains("**") {
208            return true;
209        }
210
211        // Blockquotes
212        if line.starts_with("> ") {
213            return true;
214        }
215
216        // List items
217        if line.starts_with("- ") && line.len() > 3 {
218            return true;
219        }
220
221        // Long prose lines without code terminators
222        if line.len() > 80
223            && !line.ends_with('{')
224            && !line.ends_with(';')
225            && !line.ends_with('(')
226            && !line.ends_with(')')
227            && !line.ends_with('=')
228        {
229            return true;
230        }
231
232        false
233    }
234
    /// Get processing hints based on content type.
    ///
    /// Returns a static guidance string telling a downstream summarizer
    /// what to focus on for the detected [`ContentType`].
    pub fn get_processing_hints(content_type: ContentType) -> &'static str {
        match content_type {
            ContentType::Code => {
                "This appears to be source code. Focus on:\n\
                 - Function/class definitions and their purposes\n\
                 - Import statements and dependencies\n\
                 - Error handling patterns\n\
                 - Key algorithms and logic flow"
            }
            ContentType::Logs => {
                "This appears to be log output. Focus on:\n\
                 - Error and warning messages\n\
                 - Timestamps and event sequences\n\
                 - Stack traces and exceptions\n\
                 - Key events and state changes"
            }
            ContentType::Conversation => {
                "This appears to be conversation history. Focus on:\n\
                 - User's original request/goal\n\
                 - Key decisions made\n\
                 - Tool calls and their results\n\
                 - Current state and pending tasks"
            }
            ContentType::Documents => {
                "This appears to be documentation or prose. Focus on:\n\
                 - Main topics and structure\n\
                 - Key information and facts\n\
                 - Actionable items\n\
                 - References and links"
            }
            ContentType::Mixed => {
                "Mixed content detected. Analyze the structure first, then extract key information."
            }
        }
    }
271
272    /// Estimate token count (roughly 4 chars per token)
273    pub fn estimate_tokens(text: &str) -> usize {
274        text.len().div_ceil(4)
275    }
276
277    /// Split content into semantic chunks
278    pub fn chunk(content: &str, options: Option<ChunkOptions>) -> Vec<Chunk> {
279        let opts = options.unwrap_or_default();
280        let lines: Vec<&str> = content.lines().collect();
281        let mut chunks = Vec::new();
282
283        // Find semantic boundaries
284        let boundaries = Self::find_boundaries(&lines);
285
286        let mut current_chunk: Vec<&str> = Vec::new();
287        let mut current_type = ChunkType::Text;
288        let mut current_start = 0;
289        let mut current_priority: u8 = 1;
290
291        for (i, line) in lines.iter().enumerate() {
292            // Check if we hit a boundary
293            if let Some((boundary_type, boundary_priority)) = boundaries.get(&i) {
294                if !current_chunk.is_empty() {
295                    let content = current_chunk.join("\n");
296                    let tokens = Self::estimate_tokens(&content);
297
298                    // If chunk is too big, split it
299                    if tokens > opts.max_chunk_tokens {
300                        let sub_chunks = Self::split_large_chunk(
301                            &current_chunk,
302                            current_start,
303                            current_type,
304                            opts.max_chunk_tokens,
305                        );
306                        chunks.extend(sub_chunks);
307                    } else {
308                        chunks.push(Chunk {
309                            content,
310                            chunk_type: current_type,
311                            start_line: current_start,
312                            end_line: i.saturating_sub(1),
313                            tokens,
314                            priority: current_priority,
315                        });
316                    }
317
318                    current_chunk = Vec::new();
319                    current_start = i;
320                    current_type = *boundary_type;
321                    current_priority = *boundary_priority;
322                }
323            }
324
325            current_chunk.push(line);
326
327            // Boost priority for recent lines
328            if i >= lines.len().saturating_sub(opts.preserve_recent) {
329                current_priority = current_priority.max(8);
330            }
331        }
332
333        // Final chunk
334        if !current_chunk.is_empty() {
335            let content = current_chunk.join("\n");
336            let tokens = Self::estimate_tokens(&content);
337
338            if tokens > opts.max_chunk_tokens {
339                let sub_chunks = Self::split_large_chunk(
340                    &current_chunk,
341                    current_start,
342                    current_type,
343                    opts.max_chunk_tokens,
344                );
345                chunks.extend(sub_chunks);
346            } else {
347                chunks.push(Chunk {
348                    content,
349                    chunk_type: current_type,
350                    start_line: current_start,
351                    end_line: lines.len().saturating_sub(1),
352                    tokens,
353                    priority: current_priority,
354                });
355            }
356        }
357
358        chunks
359    }
360
361    /// Find semantic boundaries in content
362    fn find_boundaries(lines: &[&str]) -> std::collections::HashMap<usize, (ChunkType, u8)> {
363        let mut boundaries = std::collections::HashMap::new();
364
365        for (i, line) in lines.iter().enumerate() {
366            let trimmed = line.trim();
367
368            // User/Assistant message markers
369            if trimmed.starts_with("[User]:") || trimmed.starts_with("[Assistant]:") {
370                boundaries.insert(i, (ChunkType::Conversation, 5));
371                continue;
372            }
373
374            // Tool output markers
375            if trimmed.starts_with("[Tool ") {
376                let priority = if trimmed.contains("FAILED") || trimmed.contains("error") {
377                    7
378                } else {
379                    3
380                };
381                boundaries.insert(i, (ChunkType::ToolOutput, priority));
382                continue;
383            }
384
385            // Code block markers
386            if trimmed.starts_with("```") {
387                boundaries.insert(i, (ChunkType::Code, 4));
388                continue;
389            }
390
391            // File path markers
392            if trimmed.starts_with('/') || trimmed.starts_with("./") || trimmed.starts_with("~/") {
393                boundaries.insert(i, (ChunkType::Code, 4));
394                continue;
395            }
396
397            // Function/class definitions
398            let def_patterns = [
399                "function",
400                "class ",
401                "def ",
402                "async function",
403                "export",
404                "fn ",
405                "impl ",
406                "struct ",
407                "enum ",
408            ];
409            if def_patterns.iter().any(|p| trimmed.starts_with(p)) {
410                boundaries.insert(i, (ChunkType::Code, 5));
411                continue;
412            }
413
414            // Error markers
415            if trimmed.to_lowercase().starts_with("error")
416                || trimmed.to_lowercase().contains("error:")
417                || trimmed.starts_with("Exception")
418                || trimmed.contains("FAILED")
419            {
420                boundaries.insert(i, (ChunkType::Text, 8));
421                continue;
422            }
423
424            // Section headers
425            if trimmed.starts_with('#') && trimmed.len() > 2 && trimmed.chars().nth(1) == Some(' ')
426            {
427                boundaries.insert(i, (ChunkType::Text, 6));
428                continue;
429            }
430        }
431
432        boundaries
433    }
434
    /// Split a large chunk into smaller pieces, each at most `max_tokens`
    /// (estimated) tokens, preserving `chunk_type` and assigning a fixed
    /// mid-level priority of 3 to every piece.
    ///
    /// `start_line` is the global line index of `lines[0]`; it keeps the
    /// produced chunks' line ranges in global coordinates.
    ///
    /// NOTE(review): a single line longer than `max_tokens` still yields one
    /// oversized chunk — lines are never split mid-line.
    fn split_large_chunk(
        lines: &[&str],
        start_line: usize,
        chunk_type: ChunkType,
        max_tokens: usize,
    ) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        let mut current: Vec<&str> = Vec::new();
        let mut current_tokens = 0;
        let mut current_start = start_line;

        for (i, line) in lines.iter().enumerate() {
            let line_tokens = Self::estimate_tokens(line);

            // Flush before the line that would overflow the budget.
            // `current` is non-empty here, so `i >= 1` and `i - 1` is safe.
            if current_tokens + line_tokens > max_tokens && !current.is_empty() {
                chunks.push(Chunk {
                    content: current.join("\n"),
                    chunk_type,
                    start_line: current_start,
                    end_line: start_line + i - 1,
                    tokens: current_tokens,
                    priority: 3,
                });
                current = Vec::new();
                current_tokens = 0;
                current_start = start_line + i;
            }

            current.push(line);
            current_tokens += line_tokens;
        }

        // Trailing piece, if any lines remain unflushed.
        if !current.is_empty() {
            chunks.push(Chunk {
                content: current.join("\n"),
                chunk_type,
                start_line: current_start,
                end_line: start_line + lines.len() - 1,
                tokens: current_tokens,
                priority: 3,
            });
        }

        chunks
    }
481
482    /// Select chunks to fit within a token budget
483    /// Prioritizes high-priority chunks and recent content
484    pub fn select_chunks(chunks: &[Chunk], max_tokens: usize) -> Vec<Chunk> {
485        let mut sorted: Vec<_> = chunks.to_vec();
486
487        // Sort by priority (desc), then by line number (desc for recent)
488        sorted.sort_by(|a, b| match b.priority.cmp(&a.priority) {
489            std::cmp::Ordering::Equal => b.start_line.cmp(&a.start_line),
490            other => other,
491        });
492
493        let mut selected = Vec::new();
494        let mut total_tokens = 0;
495
496        for chunk in sorted {
497            if total_tokens + chunk.tokens <= max_tokens {
498                selected.push(chunk.clone());
499                total_tokens += chunk.tokens;
500            }
501        }
502
503        // Re-sort by line number for coherent output
504        selected.sort_by_key(|c| c.start_line);
505
506        selected
507    }
508
509    /// Reassemble selected chunks into a single string
510    pub fn reassemble(chunks: &[Chunk]) -> String {
511        if chunks.is_empty() {
512            return String::new();
513        }
514
515        let mut parts = Vec::new();
516        let mut last_end: Option<usize> = None;
517
518        for chunk in chunks {
519            // Add separator if there's a gap
520            if let Some(end) = last_end {
521                if chunk.start_line > end + 1 {
522                    let gap = chunk.start_line - end - 1;
523                    parts.push(format!("\n[... {} lines omitted ...]\n", gap));
524                }
525            }
526            parts.push(chunk.content.clone());
527            last_end = Some(chunk.end_line);
528        }
529
530        parts.join("\n")
531    }
532
533    /// Intelligently compress content to fit within token budget
534    pub fn compress(content: &str, max_tokens: usize, options: Option<ChunkOptions>) -> String {
535        let chunks = Self::chunk(content, options);
536        let selected = Self::select_chunks(&chunks, max_tokens);
537        Self::reassemble(&selected)
538    }
539}
540
#[cfg(test)]
mod tests {
    use super::*;

    // Rust-like source should be classified as code.
    #[test]
    fn test_detect_code() {
        let content = r#"
fn main() {
    println!("Hello, world!");
}

impl Foo {
    pub fn new() -> Self {
        Self {}
    }
}
"#;
        assert_eq!(RlmChunker::detect_content_type(content), ContentType::Code);
    }

    // Bracketed speaker markers should be classified as conversation.
    #[test]
    fn test_detect_conversation() {
        let content = r#"
[User]: Can you help me with this?

[Assistant]: Of course! What do you need?

[User]: I want to implement a feature.
"#;
        assert_eq!(
            RlmChunker::detect_content_type(content),
            ContentType::Conversation
        );
    }

    // Compression must either fit the budget or mark omitted lines.
    #[test]
    fn test_compress() {
        let content = "line\n".repeat(1000);
        let compressed = RlmChunker::compress(&content, 100, None);
        let tokens = RlmChunker::estimate_tokens(&compressed);
        assert!(tokens <= 100 || compressed.contains("[..."));
    }
}