use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ContentType {
Code,
Documents,
Logs,
Conversation,
Mixed,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
pub content: String,
#[serde(rename = "type")]
pub chunk_type: ChunkType,
pub start_line: usize,
pub end_line: usize,
pub tokens: usize,
pub priority: u8,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ChunkType {
Code,
Text,
ToolOutput,
Conversation,
}
#[derive(Debug, Clone)]
pub struct ChunkOptions {
pub max_chunk_tokens: usize,
pub preserve_recent: usize,
}
impl Default for ChunkOptions {
fn default() -> Self {
Self {
max_chunk_tokens: 4000,
preserve_recent: 100,
}
}
}
pub struct RlmChunker;
impl RlmChunker {
pub fn detect_content_type(content: &str) -> ContentType {
let lines: Vec<&str> = content.lines().collect();
let sample_size = lines.len().min(200);
let sample: Vec<&str> = lines
.iter()
.take(sample_size / 2)
.chain(lines.iter().rev().take(sample_size / 2))
.copied()
.collect();
let mut code_indicators = 0;
let mut log_indicators = 0;
let mut conversation_indicators = 0;
let mut document_indicators = 0;
for line in &sample {
let trimmed = line.trim();
if Self::is_code_line(trimmed) {
code_indicators += 1;
}
if Self::is_log_line(trimmed) {
log_indicators += 1;
}
if Self::is_conversation_line(trimmed) {
conversation_indicators += 1;
}
if Self::is_document_line(trimmed) {
document_indicators += 1;
}
}
let total =
code_indicators + log_indicators + conversation_indicators + document_indicators;
if total == 0 {
return ContentType::Mixed;
}
let threshold = (total as f64 * 0.3) as usize;
if conversation_indicators > threshold {
ContentType::Conversation
} else if log_indicators > threshold {
ContentType::Logs
} else if code_indicators > threshold {
ContentType::Code
} else if document_indicators > threshold {
ContentType::Documents
} else {
ContentType::Mixed
}
}
fn is_code_line(line: &str) -> bool {
let patterns = [
"function", "class ", "def ", "const ", "let ", "var ", "import ", "export ", "async ",
"fn ", "impl ", "struct ", "enum ", "pub ", "use ", "mod ", "trait ",
];
if patterns.iter().any(|p| line.starts_with(p)) {
return true;
}
if matches!(line, "{" | "}" | "(" | ")" | ";" | "{}" | "};") {
return true;
}
if line.starts_with("//")
|| line.starts_with("#")
|| line.starts_with("*")
|| line.starts_with("/*")
{
return true;
}
false
}
fn is_log_line(line: &str) -> bool {
if line.len() >= 10
&& line.chars().take(4).all(|c| c.is_ascii_digit())
&& line.chars().nth(4) == Some('-')
{
return true;
}
if line.starts_with('[')
&& line.len() > 5
&& line.chars().nth(1).is_some_and(|c| c.is_ascii_digit())
{
return true;
}
let log_levels = ["INFO", "DEBUG", "WARN", "ERROR", "FATAL", "TRACE"];
for level in log_levels {
if line.starts_with(level) || line.contains(&format!(" {} ", level)) {
return true;
}
}
false
}
fn is_conversation_line(line: &str) -> bool {
let patterns = [
"[User]:",
"[Assistant]:",
"[Human]:",
"[AI]:",
"User:",
"Assistant:",
"Human:",
"AI:",
"[Tool ",
"<user>",
"<assistant>",
"<system>",
];
patterns.iter().any(|p| line.starts_with(p))
}
fn is_document_line(line: &str) -> bool {
if line.starts_with('#') && line.chars().nth(1).is_some_and(|c| c == ' ' || c == '#') {
return true;
}
if line.starts_with("**") && line.contains("**") {
return true;
}
if line.starts_with("> ") {
return true;
}
if line.starts_with("- ") && line.len() > 3 {
return true;
}
if line.len() > 80
&& !line.ends_with('{')
&& !line.ends_with(';')
&& !line.ends_with('(')
&& !line.ends_with(')')
&& !line.ends_with('=')
{
return true;
}
false
}
pub fn get_processing_hints(content_type: ContentType) -> &'static str {
match content_type {
ContentType::Code => {
"This appears to be source code. Focus on:\n\
- Function/class definitions and their purposes\n\
- Import statements and dependencies\n\
- Error handling patterns\n\
- Key algorithms and logic flow"
}
ContentType::Logs => {
"This appears to be log output. Focus on:\n\
- Error and warning messages\n\
- Timestamps and event sequences\n\
- Stack traces and exceptions\n\
- Key events and state changes"
}
ContentType::Conversation => {
"This appears to be conversation history. Focus on:\n\
- User's original request/goal\n\
- Key decisions made\n\
- Tool calls and their results\n\
- Current state and pending tasks"
}
ContentType::Documents => {
"This appears to be documentation or prose. Focus on:\n\
- Main topics and structure\n\
- Key information and facts\n\
- Actionable items\n\
- References and links"
}
ContentType::Mixed => {
"Mixed content detected. Analyze the structure first, then extract key information."
}
}
}
pub fn estimate_tokens(text: &str) -> usize {
text.len().div_ceil(4)
}
pub fn chunk(content: &str, options: Option<ChunkOptions>) -> Vec<Chunk> {
let opts = options.unwrap_or_default();
let lines: Vec<&str> = content.lines().collect();
let mut chunks = Vec::new();
let boundaries = Self::find_boundaries(&lines);
let mut current_chunk: Vec<&str> = Vec::new();
let mut current_type = ChunkType::Text;
let mut current_start = 0;
let mut current_priority: u8 = 1;
for (i, line) in lines.iter().enumerate() {
if let Some((boundary_type, boundary_priority)) = boundaries.get(&i) {
if !current_chunk.is_empty() {
let content = current_chunk.join("\n");
let tokens = Self::estimate_tokens(&content);
if tokens > opts.max_chunk_tokens {
let sub_chunks = Self::split_large_chunk(
¤t_chunk,
current_start,
current_type,
opts.max_chunk_tokens,
);
chunks.extend(sub_chunks);
} else {
chunks.push(Chunk {
content,
chunk_type: current_type,
start_line: current_start,
end_line: i.saturating_sub(1),
tokens,
priority: current_priority,
});
}
current_chunk = Vec::new();
current_start = i;
current_type = *boundary_type;
current_priority = *boundary_priority;
}
}
current_chunk.push(line);
if i >= lines.len().saturating_sub(opts.preserve_recent) {
current_priority = current_priority.max(8);
}
}
if !current_chunk.is_empty() {
let content = current_chunk.join("\n");
let tokens = Self::estimate_tokens(&content);
if tokens > opts.max_chunk_tokens {
let sub_chunks = Self::split_large_chunk(
¤t_chunk,
current_start,
current_type,
opts.max_chunk_tokens,
);
chunks.extend(sub_chunks);
} else {
chunks.push(Chunk {
content,
chunk_type: current_type,
start_line: current_start,
end_line: lines.len().saturating_sub(1),
tokens,
priority: current_priority,
});
}
}
chunks
}
fn find_boundaries(lines: &[&str]) -> std::collections::HashMap<usize, (ChunkType, u8)> {
let mut boundaries = std::collections::HashMap::new();
for (i, line) in lines.iter().enumerate() {
let trimmed = line.trim();
if trimmed.starts_with("[User]:") || trimmed.starts_with("[Assistant]:") {
boundaries.insert(i, (ChunkType::Conversation, 5));
continue;
}
if trimmed.starts_with("[Tool ") {
let priority = if trimmed.contains("FAILED") || trimmed.contains("error") {
7
} else {
3
};
boundaries.insert(i, (ChunkType::ToolOutput, priority));
continue;
}
if trimmed.starts_with("```") {
boundaries.insert(i, (ChunkType::Code, 4));
continue;
}
if trimmed.starts_with('/') || trimmed.starts_with("./") || trimmed.starts_with("~/") {
boundaries.insert(i, (ChunkType::Code, 4));
continue;
}
let def_patterns = [
"function",
"class ",
"def ",
"async function",
"export",
"fn ",
"impl ",
"struct ",
"enum ",
];
if def_patterns.iter().any(|p| trimmed.starts_with(p)) {
boundaries.insert(i, (ChunkType::Code, 5));
continue;
}
if trimmed.to_lowercase().starts_with("error")
|| trimmed.to_lowercase().contains("error:")
|| trimmed.starts_with("Exception")
|| trimmed.contains("FAILED")
{
boundaries.insert(i, (ChunkType::Text, 8));
continue;
}
if trimmed.starts_with('#') && trimmed.len() > 2 && trimmed.chars().nth(1) == Some(' ')
{
boundaries.insert(i, (ChunkType::Text, 6));
continue;
}
}
boundaries
}
fn split_large_chunk(
lines: &[&str],
start_line: usize,
chunk_type: ChunkType,
max_tokens: usize,
) -> Vec<Chunk> {
let mut chunks = Vec::new();
let mut current: Vec<&str> = Vec::new();
let mut current_tokens = 0;
let mut current_start = start_line;
for (i, line) in lines.iter().enumerate() {
let line_tokens = Self::estimate_tokens(line);
if current_tokens + line_tokens > max_tokens && !current.is_empty() {
chunks.push(Chunk {
content: current.join("\n"),
chunk_type,
start_line: current_start,
end_line: start_line + i - 1,
tokens: current_tokens,
priority: 3,
});
current = Vec::new();
current_tokens = 0;
current_start = start_line + i;
}
current.push(line);
current_tokens += line_tokens;
}
if !current.is_empty() {
chunks.push(Chunk {
content: current.join("\n"),
chunk_type,
start_line: current_start,
end_line: start_line + lines.len() - 1,
tokens: current_tokens,
priority: 3,
});
}
chunks
}
pub fn select_chunks(chunks: &[Chunk], max_tokens: usize) -> Vec<Chunk> {
let mut sorted: Vec<_> = chunks.to_vec();
sorted.sort_by(|a, b| match b.priority.cmp(&a.priority) {
std::cmp::Ordering::Equal => b.start_line.cmp(&a.start_line),
other => other,
});
let mut selected = Vec::new();
let mut total_tokens = 0;
for chunk in sorted {
if total_tokens + chunk.tokens <= max_tokens {
selected.push(chunk.clone());
total_tokens += chunk.tokens;
}
}
selected.sort_by_key(|c| c.start_line);
selected
}
pub fn reassemble(chunks: &[Chunk]) -> String {
if chunks.is_empty() {
return String::new();
}
let mut parts = Vec::new();
let mut last_end: Option<usize> = None;
for chunk in chunks {
if let Some(end) = last_end {
if chunk.start_line > end + 1 {
let gap = chunk.start_line - end - 1;
parts.push(format!("\n[... {} lines omitted ...]\n", gap));
}
}
parts.push(chunk.content.clone());
last_end = Some(chunk.end_line);
}
parts.join("\n")
}
pub fn compress(content: &str, max_tokens: usize, options: Option<ChunkOptions>) -> String {
let chunks = Self::chunk(content, options);
let selected = Self::select_chunks(&chunks, max_tokens);
Self::reassemble(&selected)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_detect_code() {
let content = r#"
fn main() {
println!("Hello, world!");
}
impl Foo {
pub fn new() -> Self {
Self {}
}
}
"#;
assert_eq!(RlmChunker::detect_content_type(content), ContentType::Code);
}
#[test]
fn test_detect_conversation() {
let content = r#"
[User]: Can you help me with this?
[Assistant]: Of course! What do you need?
[User]: I want to implement a feature.
"#;
assert_eq!(
RlmChunker::detect_content_type(content),
ContentType::Conversation
);
}
#[test]
fn test_compress() {
let content = "line\n".repeat(1000);
let compressed = RlmChunker::compress(&content, 100, None);
let tokens = RlmChunker::estimate_tokens(&compressed);
assert!(tokens <= 100 || compressed.contains("[..."));
}
}