1use serde::{Deserialize, Serialize};
7
/// Broad category of a text payload, used to pick a summarization strategy.
///
/// Serialized in lowercase (e.g. `"code"`, `"logs"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ContentType {
    /// Source code: definition keywords, structural punctuation, comments.
    Code,
    /// Documentation or prose: markdown headings, quotes, long sentences.
    Documents,
    /// Log output: timestamps and log levels such as INFO/ERROR.
    Logs,
    /// Chat transcripts: `[User]:` / `[Assistant]:` style speaker markers.
    Conversation,
    /// No single category dominated the heuristic indicator counts.
    Mixed,
}
18
/// A contiguous slice of the input, tagged with its kind, position, and
/// selection priority.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// The chunk's text (original lines joined with `\n`).
    pub content: String,
    /// Kind of content; serialized under the JSON key `"type"`.
    #[serde(rename = "type")]
    pub chunk_type: ChunkType,
    /// First input line covered (0-based, inclusive).
    pub start_line: usize,
    /// Last input line covered (0-based, inclusive).
    pub end_line: usize,
    /// Estimated token count of `content` (see `RlmChunker::estimate_tokens`).
    pub tokens: usize,
    /// Selection priority; higher-priority chunks are kept first when
    /// compressing to a token budget.
    pub priority: u8,
}
31
/// Fine-grained kind of a single `Chunk`; serialized in snake_case
/// (e.g. `ToolOutput` becomes `"tool_output"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ChunkType {
    /// Code fences, file paths, and definition lines.
    Code,
    /// Default kind; also used for error lines and headings.
    Text,
    /// Output following a `[Tool ...` marker.
    ToolOutput,
    /// A `[User]:` / `[Assistant]:` conversation turn.
    Conversation,
}
40
/// Tuning knobs for `RlmChunker::chunk`.
#[derive(Debug, Clone)]
pub struct ChunkOptions {
    /// Chunks estimated above this token count are split into sub-chunks.
    pub max_chunk_tokens: usize,
    /// Lines within this trailing window get their chunk priority raised to
    /// at least 8, so recent context survives compression.
    pub preserve_recent: usize,
}
49
50impl Default for ChunkOptions {
51 fn default() -> Self {
52 Self {
53 max_chunk_tokens: 4000,
54 preserve_recent: 100,
55 }
56 }
57}
58
/// Stateless namespace for the chunking heuristics: content-type detection,
/// boundary finding, chunking, priority-based selection, and reassembly.
pub struct RlmChunker;
61
62impl RlmChunker {
63 pub fn detect_content_type(content: &str) -> ContentType {
65 let lines: Vec<&str> = content.lines().collect();
66 let sample_size = lines.len().min(200);
67
68 let sample: Vec<&str> = lines
70 .iter()
71 .take(sample_size / 2)
72 .chain(lines.iter().rev().take(sample_size / 2))
73 .copied()
74 .collect();
75
76 let mut code_indicators = 0;
77 let mut log_indicators = 0;
78 let mut conversation_indicators = 0;
79 let mut document_indicators = 0;
80
81 for line in &sample {
82 let trimmed = line.trim();
83
84 if Self::is_code_line(trimmed) {
86 code_indicators += 1;
87 }
88
89 if Self::is_log_line(trimmed) {
91 log_indicators += 1;
92 }
93
94 if Self::is_conversation_line(trimmed) {
96 conversation_indicators += 1;
97 }
98
99 if Self::is_document_line(trimmed) {
101 document_indicators += 1;
102 }
103 }
104
105 let total =
106 code_indicators + log_indicators + conversation_indicators + document_indicators;
107 if total == 0 {
108 return ContentType::Mixed;
109 }
110
111 let threshold = (total as f64 * 0.3) as usize;
112
113 if conversation_indicators > threshold {
114 ContentType::Conversation
115 } else if log_indicators > threshold {
116 ContentType::Logs
117 } else if code_indicators > threshold {
118 ContentType::Code
119 } else if document_indicators > threshold {
120 ContentType::Documents
121 } else {
122 ContentType::Mixed
123 }
124 }
125
126 fn is_code_line(line: &str) -> bool {
127 let patterns = [
129 "function", "class ", "def ", "const ", "let ", "var ", "import ", "export ", "async ",
130 "fn ", "impl ", "struct ", "enum ", "pub ", "use ", "mod ", "trait ",
131 ];
132
133 if patterns.iter().any(|p| line.starts_with(p)) {
134 return true;
135 }
136
137 if matches!(line, "{" | "}" | "(" | ")" | ";" | "{}" | "};") {
139 return true;
140 }
141
142 if line.starts_with("//")
144 || line.starts_with("#")
145 || line.starts_with("*")
146 || line.starts_with("/*")
147 {
148 return true;
149 }
150
151 false
152 }
153
154 fn is_log_line(line: &str) -> bool {
155 if line.len() >= 10
157 && line.chars().take(4).all(|c| c.is_ascii_digit())
158 && line.chars().nth(4) == Some('-')
159 {
160 return true;
161 }
162
163 if line.starts_with('[')
165 && line.len() > 5
166 && line.chars().nth(1).is_some_and(|c| c.is_ascii_digit())
167 {
168 return true;
169 }
170
171 let log_levels = ["INFO", "DEBUG", "WARN", "ERROR", "FATAL", "TRACE"];
173 for level in log_levels {
174 if line.starts_with(level) || line.contains(&format!(" {} ", level)) {
175 return true;
176 }
177 }
178
179 false
180 }
181
182 fn is_conversation_line(line: &str) -> bool {
183 let patterns = [
184 "[User]:",
185 "[Assistant]:",
186 "[Human]:",
187 "[AI]:",
188 "User:",
189 "Assistant:",
190 "Human:",
191 "AI:",
192 "[Tool ",
193 "<user>",
194 "<assistant>",
195 "<system>",
196 ];
197 patterns.iter().any(|p| line.starts_with(p))
198 }
199
200 fn is_document_line(line: &str) -> bool {
201 if line.starts_with('#') && line.chars().nth(1).is_some_and(|c| c == ' ' || c == '#') {
203 return true;
204 }
205
206 if line.starts_with("**") && line.contains("**") {
208 return true;
209 }
210
211 if line.starts_with("> ") {
213 return true;
214 }
215
216 if line.starts_with("- ") && line.len() > 3 {
218 return true;
219 }
220
221 if line.len() > 80
223 && !line.ends_with('{')
224 && !line.ends_with(';')
225 && !line.ends_with('(')
226 && !line.ends_with(')')
227 && !line.ends_with('=')
228 {
229 return true;
230 }
231
232 false
233 }
234
    /// Return a static prompt snippet telling a downstream summarizer what
    /// to focus on for the detected `content_type`.
    pub fn get_processing_hints(content_type: ContentType) -> &'static str {
        match content_type {
            ContentType::Code => {
                "This appears to be source code. Focus on:\n\
                - Function/class definitions and their purposes\n\
                - Import statements and dependencies\n\
                - Error handling patterns\n\
                - Key algorithms and logic flow"
            }
            ContentType::Logs => {
                "This appears to be log output. Focus on:\n\
                - Error and warning messages\n\
                - Timestamps and event sequences\n\
                - Stack traces and exceptions\n\
                - Key events and state changes"
            }
            ContentType::Conversation => {
                "This appears to be conversation history. Focus on:\n\
                - User's original request/goal\n\
                - Key decisions made\n\
                - Tool calls and their results\n\
                - Current state and pending tasks"
            }
            ContentType::Documents => {
                "This appears to be documentation or prose. Focus on:\n\
                - Main topics and structure\n\
                - Key information and facts\n\
                - Actionable items\n\
                - References and links"
            }
            ContentType::Mixed => {
                "Mixed content detected. Analyze the structure first, then extract key information."
            }
        }
    }
271
272 pub fn estimate_tokens(text: &str) -> usize {
274 text.len().div_ceil(4)
275 }
276
277 pub fn chunk(content: &str, options: Option<ChunkOptions>) -> Vec<Chunk> {
279 let opts = options.unwrap_or_default();
280 let lines: Vec<&str> = content.lines().collect();
281 let mut chunks = Vec::new();
282
283 let boundaries = Self::find_boundaries(&lines);
285
286 let mut current_chunk: Vec<&str> = Vec::new();
287 let mut current_type = ChunkType::Text;
288 let mut current_start = 0;
289 let mut current_priority: u8 = 1;
290
291 for (i, line) in lines.iter().enumerate() {
292 if let Some((boundary_type, boundary_priority)) = boundaries.get(&i) {
294 if !current_chunk.is_empty() {
295 let content = current_chunk.join("\n");
296 let tokens = Self::estimate_tokens(&content);
297
298 if tokens > opts.max_chunk_tokens {
300 let sub_chunks = Self::split_large_chunk(
301 ¤t_chunk,
302 current_start,
303 current_type,
304 opts.max_chunk_tokens,
305 );
306 chunks.extend(sub_chunks);
307 } else {
308 chunks.push(Chunk {
309 content,
310 chunk_type: current_type,
311 start_line: current_start,
312 end_line: i.saturating_sub(1),
313 tokens,
314 priority: current_priority,
315 });
316 }
317
318 current_chunk = Vec::new();
319 current_start = i;
320 current_type = *boundary_type;
321 current_priority = *boundary_priority;
322 }
323 }
324
325 current_chunk.push(line);
326
327 if i >= lines.len().saturating_sub(opts.preserve_recent) {
329 current_priority = current_priority.max(8);
330 }
331 }
332
333 if !current_chunk.is_empty() {
335 let content = current_chunk.join("\n");
336 let tokens = Self::estimate_tokens(&content);
337
338 if tokens > opts.max_chunk_tokens {
339 let sub_chunks = Self::split_large_chunk(
340 ¤t_chunk,
341 current_start,
342 current_type,
343 opts.max_chunk_tokens,
344 );
345 chunks.extend(sub_chunks);
346 } else {
347 chunks.push(Chunk {
348 content,
349 chunk_type: current_type,
350 start_line: current_start,
351 end_line: lines.len().saturating_sub(1),
352 tokens,
353 priority: current_priority,
354 });
355 }
356 }
357
358 chunks
359 }
360
361 fn find_boundaries(lines: &[&str]) -> std::collections::HashMap<usize, (ChunkType, u8)> {
363 let mut boundaries = std::collections::HashMap::new();
364
365 for (i, line) in lines.iter().enumerate() {
366 let trimmed = line.trim();
367
368 if trimmed.starts_with("[User]:") || trimmed.starts_with("[Assistant]:") {
370 boundaries.insert(i, (ChunkType::Conversation, 5));
371 continue;
372 }
373
374 if trimmed.starts_with("[Tool ") {
376 let priority = if trimmed.contains("FAILED") || trimmed.contains("error") {
377 7
378 } else {
379 3
380 };
381 boundaries.insert(i, (ChunkType::ToolOutput, priority));
382 continue;
383 }
384
385 if trimmed.starts_with("```") {
387 boundaries.insert(i, (ChunkType::Code, 4));
388 continue;
389 }
390
391 if trimmed.starts_with('/') || trimmed.starts_with("./") || trimmed.starts_with("~/") {
393 boundaries.insert(i, (ChunkType::Code, 4));
394 continue;
395 }
396
397 let def_patterns = [
399 "function",
400 "class ",
401 "def ",
402 "async function",
403 "export",
404 "fn ",
405 "impl ",
406 "struct ",
407 "enum ",
408 ];
409 if def_patterns.iter().any(|p| trimmed.starts_with(p)) {
410 boundaries.insert(i, (ChunkType::Code, 5));
411 continue;
412 }
413
414 if trimmed.to_lowercase().starts_with("error")
416 || trimmed.to_lowercase().contains("error:")
417 || trimmed.starts_with("Exception")
418 || trimmed.contains("FAILED")
419 {
420 boundaries.insert(i, (ChunkType::Text, 8));
421 continue;
422 }
423
424 if trimmed.starts_with('#') && trimmed.len() > 2 && trimmed.chars().nth(1) == Some(' ')
426 {
427 boundaries.insert(i, (ChunkType::Text, 6));
428 continue;
429 }
430 }
431
432 boundaries
433 }
434
435 fn split_large_chunk(
437 lines: &[&str],
438 start_line: usize,
439 chunk_type: ChunkType,
440 max_tokens: usize,
441 ) -> Vec<Chunk> {
442 let mut chunks = Vec::new();
443 let mut current: Vec<&str> = Vec::new();
444 let mut current_tokens = 0;
445 let mut current_start = start_line;
446
447 for (i, line) in lines.iter().enumerate() {
448 let line_tokens = Self::estimate_tokens(line);
449
450 if current_tokens + line_tokens > max_tokens && !current.is_empty() {
451 chunks.push(Chunk {
452 content: current.join("\n"),
453 chunk_type,
454 start_line: current_start,
455 end_line: start_line + i - 1,
456 tokens: current_tokens,
457 priority: 3,
458 });
459 current = Vec::new();
460 current_tokens = 0;
461 current_start = start_line + i;
462 }
463
464 current.push(line);
465 current_tokens += line_tokens;
466 }
467
468 if !current.is_empty() {
469 chunks.push(Chunk {
470 content: current.join("\n"),
471 chunk_type,
472 start_line: current_start,
473 end_line: start_line + lines.len() - 1,
474 tokens: current_tokens,
475 priority: 3,
476 });
477 }
478
479 chunks
480 }
481
482 pub fn select_chunks(chunks: &[Chunk], max_tokens: usize) -> Vec<Chunk> {
485 let mut sorted: Vec<_> = chunks.to_vec();
486
487 sorted.sort_by(|a, b| match b.priority.cmp(&a.priority) {
489 std::cmp::Ordering::Equal => b.start_line.cmp(&a.start_line),
490 other => other,
491 });
492
493 let mut selected = Vec::new();
494 let mut total_tokens = 0;
495
496 for chunk in sorted {
497 if total_tokens + chunk.tokens <= max_tokens {
498 selected.push(chunk.clone());
499 total_tokens += chunk.tokens;
500 }
501 }
502
503 selected.sort_by_key(|c| c.start_line);
505
506 selected
507 }
508
509 pub fn reassemble(chunks: &[Chunk]) -> String {
511 if chunks.is_empty() {
512 return String::new();
513 }
514
515 let mut parts = Vec::new();
516 let mut last_end: Option<usize> = None;
517
518 for chunk in chunks {
519 if let Some(end) = last_end {
521 if chunk.start_line > end + 1 {
522 let gap = chunk.start_line - end - 1;
523 parts.push(format!("\n[... {} lines omitted ...]\n", gap));
524 }
525 }
526 parts.push(chunk.content.clone());
527 last_end = Some(chunk.end_line);
528 }
529
530 parts.join("\n")
531 }
532
533 pub fn compress(content: &str, max_tokens: usize, options: Option<ChunkOptions>) -> String {
535 let chunks = Self::chunk(content, options);
536 let selected = Self::select_chunks(&chunks, max_tokens);
537 Self::reassemble(&selected)
538 }
539}
540
#[cfg(test)]
mod tests {
    use super::*;

    // A snippet of Rust source should be classified as code.
    #[test]
    fn test_detect_code() {
        let content = r#"
fn main() {
    println!("Hello, world!");
}

impl Foo {
    pub fn new() -> Self {
        Self {}
    }
}
"#;
        assert_eq!(RlmChunker::detect_content_type(content), ContentType::Code);
    }

    // Speaker-labelled turns should be classified as conversation.
    #[test]
    fn test_detect_conversation() {
        let content = r#"
[User]: Can you help me with this?

[Assistant]: Of course! What do you need?

[User]: I want to implement a feature.
"#;
        assert_eq!(
            RlmChunker::detect_content_type(content),
            ContentType::Conversation
        );
    }

    // Compression must either fit the token budget or mark omitted lines.
    #[test]
    fn test_compress() {
        let content = "line\n".repeat(1000);
        let compressed = RlmChunker::compress(&content, 100, None);
        let tokens = RlmChunker::estimate_tokens(&compressed);
        assert!(tokens <= 100 || compressed.contains("[..."));
    }
}