use crate::session::{estimate_tokens, Message};
/// A semantically coherent span of message text (a paragraph or a fenced
/// code block) together with the metadata used to decide whether it
/// survives context-budget selection.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
/// Raw text of the segment, newline-terminated lines included.
pub content: String,
/// Retention score; higher scores are selected first by
/// `select_chunks_within_budget`.
pub importance: f64,
/// Coarse classification driving the base score and decay rate.
pub chunk_type: ChunkType,
/// Detected discourse cue; influences scoring and pairing/compression.
pub discourse_relation: DiscourseRelation,
}
/// Coarse importance class of a chunk, in descending retention priority:
/// Critical (errors, decisions, questions, plan progress, tool calls),
/// Reference (paths, URLs, code, numeric data), Context (default narrative
/// text), Conversational (short acknowledgements like "ok"/"thanks").
#[derive(Debug, Clone, PartialEq)]
pub enum ChunkType {
Critical, Reference, Context, Conversational, }
/// Discourse cue detected from surface markers in the text.
/// Contrast/Cause boost importance and request their antecedent chunk;
/// Elaboration/Background are penalized; Sequence runs are compressed to
/// their final step; None means no marker was found.
#[derive(Debug, Clone, PartialEq)]
pub enum DiscourseRelation {
Elaboration, Contrast, Cause, Sequence, Background, None, }
/// Splits every non-system message into scored semantic chunks.
///
/// Each message body is segmented on blank lines and code fences, and each
/// non-empty segment is classified, related, and scored independently.
pub fn chunk_messages(messages: &[Message]) -> Vec<SemanticChunk> {
    let mut result = Vec::new();
    // System prompts are managed elsewhere; only user/assistant/tool
    // content participates in chunking.
    for msg in messages.iter().filter(|m| m.role != "system") {
        for segment in split_by_boundaries(&msg.content) {
            // Whitespace-only segments carry no information.
            if segment.trim().is_empty() {
                continue;
            }
            let chunk_type = classify_chunk(&segment, msg);
            let relation = detect_discourse_relation(&segment);
            let importance = calculate_importance(&segment, &chunk_type, &relation, msg);
            result.push(SemanticChunk {
                content: segment,
                importance,
                chunk_type,
                discourse_relation: relation,
            });
        }
    }
    result
}
/// Segments `text` into chunks at blank lines and fenced code blocks.
///
/// Prose between blank lines becomes one segment; a fenced code block
/// (opening fence, body, closing fence) becomes one segment.
///
/// Fix: the previous version flushed `current` on EVERY fence line, so the
/// closing fence was detached from the code body it terminates, splitting
/// each code block into two segments. The flush now happens only when a
/// fence *opens* (to separate preceding prose from the code block).
fn split_by_boundaries(text: &str) -> Vec<String> {
    let mut segments = Vec::new();
    let mut current = String::new();
    let mut in_code_block = false;
    for line in text.lines() {
        if line.trim().starts_with("```") {
            // Opening fence: flush any prose accumulated before the block.
            if !in_code_block && !current.is_empty() {
                segments.push(std::mem::take(&mut current));
            }
            in_code_block = !in_code_block;
            current.push_str(line);
            current.push('\n');
            // Closing fence: the whole fenced block is complete — emit it.
            if !in_code_block {
                segments.push(std::mem::take(&mut current));
            }
            continue;
        }
        if in_code_block {
            // Inside a fence, blank lines do NOT split segments.
            current.push_str(line);
            current.push('\n');
            continue;
        }
        if line.trim().is_empty() {
            // Blank line: paragraph boundary for prose.
            if !current.is_empty() {
                segments.push(std::mem::take(&mut current));
            }
            continue;
        }
        current.push_str(line);
        current.push('\n');
    }
    // Trailing text (including an unterminated code block) is kept.
    if !current.is_empty() {
        segments.push(current);
    }
    segments
}
/// Classifies a text segment into a coarse importance class.
///
/// Checks run in priority order: Critical (errors/warnings, decisions,
/// user questions, plan/progress markers, tool calls) > Reference (paths,
/// URLs, code, numeric data, env vars, definitions) > Conversational
/// (short acknowledgements) > Context (fallback).
///
/// Fix: the error/warning check previously tested the raw text with
/// hand-duplicated capitalized variants ("error"/"Error", "warning"/
/// "Warning") and so missed other casings ("ERROR", "Failed", "Issue").
/// It now uses the already-computed lowercase copy, matching the style of
/// every other keyword check in this function.
fn classify_chunk(text: &str, msg: &Message) -> ChunkType {
    let lower = text.to_lowercase();
    // Errors and warnings, case-insensitive.
    if lower.contains("error")
        || lower.contains("failed")
        || lower.contains("issue")
        || lower.contains("warning")
    {
        return ChunkType::Critical;
    }
    // Decisions and commitments.
    if lower.contains("decided")
        || lower.contains("will do")
        || lower.contains("agreed")
        || lower.contains("must")
        || lower.contains("should not")
        || lower.contains("don't")
        || lower.contains("won't")
    {
        return ChunkType::Critical;
    }
    // Direct user questions.
    if text.contains('?') && msg.role == "user" {
        return ChunkType::Critical;
    }
    // Plan and task-progress markers.
    if lower.contains("plan(")
        || lower.contains("task:")
        || lower.contains("step ")
        || lower.contains("phase:")
        || lower.contains("todo")
        || lower.contains("next:")
        || lower.contains("task completed")
        || lower.contains("completed:")
        || lower.contains("next task")
        || (lower.contains("task") && lower.contains('/'))
        || lower.contains("step completed")
        || lower.contains("phase completed")
    {
        return ChunkType::Critical;
    }
    // Tool interactions are always kept.
    if msg.tool_calls.is_some() {
        return ChunkType::Critical;
    }
    // File paths and URLs.
    if text.contains('/') || text.contains("http") || text.contains("www") {
        return ChunkType::Reference;
    }
    // Inline or fenced code.
    if text.contains("```") || text.contains('`') {
        return ChunkType::Reference;
    }
    // Numeric data: more than two digits anywhere in the segment.
    if text.chars().filter(|c| c.is_numeric()).count() > 2 {
        return ChunkType::Reference;
    }
    // Environment-variable assignments.
    if text.contains('=') && (text.contains("export") || text.contains("ENV")) {
        return ChunkType::Reference;
    }
    // Code definitions.
    if text.contains("fn ")
        || text.contains("def ")
        || text.contains("function ")
        || text.contains("class ")
        || text.contains("impl ")
    {
        return ChunkType::Reference;
    }
    // Short acknowledgements carry little information.
    let trimmed = lower.trim();
    if trimmed.starts_with("ok")
        || trimmed.starts_with("sure")
        || trimmed.starts_with("thanks")
        || trimmed.starts_with("great")
        || trimmed.starts_with("got it")
        || trimmed.starts_with("understood")
        || trimmed.starts_with("yes")
        || trimmed.starts_with("no problem")
    {
        return ChunkType::Conversational;
    }
    ChunkType::Context
}
/// Detects a discourse relation from surface markers in the text.
///
/// Marker groups are tried in a fixed priority order (Elaboration,
/// Contrast, Cause, Sequence, Background); the first group with any hit
/// wins, and `None` is returned when nothing matches.
fn detect_discourse_relation(text: &str) -> DiscourseRelation {
    let lower = text.to_lowercase();
    // True when any of the given markers occurs in the lowercased text.
    let has_any = |markers: &[&str]| markers.iter().any(|m| lower.contains(m));
    if has_any(&["for example", "specifically", "in particular", "such as", "i.e.", "e.g."]) {
        return DiscourseRelation::Elaboration;
    }
    // "but " keeps its trailing space to avoid matching words like "button".
    if has_any(&["however", "but ", "although", "on the other hand", "instead", "rather than"]) {
        return DiscourseRelation::Contrast;
    }
    // "so " likewise avoids matching "some", "software", etc.
    if has_any(&["because", "therefore", "thus", "so ", "as a result", "consequently"]) {
        return DiscourseRelation::Cause;
    }
    if has_any(&["first", "then", "next", "after", "finally", "step "]) {
        return DiscourseRelation::Sequence;
    }
    if has_any(&["background", "context", "historically", "previously", "as mentioned"]) {
        return DiscourseRelation::Background;
    }
    DiscourseRelation::None
}
/// Scores a chunk for retention: base value from its type, additive
/// boosts/penalties from message role and discourse relation, then an
/// exponential time decay whose rate depends on the chunk type.
fn calculate_importance(
    text: &str,
    chunk_type: &ChunkType,
    discourse_relation: &DiscourseRelation,
    msg: &Message,
) -> f64 {
    // Base value per classification.
    let base = match chunk_type {
        ChunkType::Critical => 10.0,
        ChunkType::Reference => 7.0,
        ChunkType::Context => 4.0,
        ChunkType::Conversational => 1.0,
    };
    // Additive adjustments, summed in the same order as the base cases.
    let tool_bonus = if msg.tool_calls.is_some() { 5.0 } else { 0.0 };
    let user_bonus = if msg.role == "user" { 2.0 } else { 0.0 };
    let question_bonus = if text.contains('?') { 1.5 } else { 0.0 };
    let relation_adjust = match discourse_relation {
        // Contrast/Cause chunks carry key reasoning — boost them.
        DiscourseRelation::Contrast | DiscourseRelation::Cause => 2.0,
        // Elaboration/Background are supplementary — penalize slightly.
        DiscourseRelation::Elaboration | DiscourseRelation::Background => -1.0,
        DiscourseRelation::Sequence | DiscourseRelation::None => 0.0,
    };
    let mut score = base + tool_bonus + user_bonus + question_bonus + relation_adjust;
    // Exponential decay: scale by e^(-age / constant), where longer-lived
    // chunk types decay more slowly (constants are in hours).
    let age_hours = calculate_age_hours(msg.timestamp);
    let decay_hours = match chunk_type {
        ChunkType::Critical => 72.0,
        ChunkType::Reference => 48.0,
        ChunkType::Context => 24.0,
        ChunkType::Conversational => 6.0,
    };
    score *= (-age_hours / decay_hours).exp();
    // Very recent content gets a freshness boost.
    if age_hours < 2.0 {
        score *= 1.5;
    }
    // Old sequence steps lose relevance faster than other relations.
    if matches!(discourse_relation, DiscourseRelation::Sequence) && age_hours > 12.0 {
        score *= 0.7;
    }
    score
}
/// Hours elapsed since the Unix-epoch `timestamp` (in seconds).
/// Future timestamps clamp to 0 rather than going negative.
fn calculate_age_hours(timestamp: u64) -> f64 {
    let now_secs = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_secs())
        .unwrap_or_default();
    now_secs.saturating_sub(timestamp) as f64 / 3600.0
}
/// Greedily selects the most important chunks that fit within
/// `target_tokens`, then compresses Sequence runs in the result.
///
/// Chunks whose discourse relation depends on an antecedent (Contrast or
/// Cause) try to pull in the chunk that immediately PRECEDES them in the
/// original `chunks` order, so e.g. "However, ..." keeps the statement it
/// contrasts with; the pair is admitted only if both fit the budget.
///
/// Fix: the previous implementation looked up `idx - 1` in the
/// importance-SORTED vector, so a Contrast/Cause chunk paired with the
/// next-most-important chunk — an arbitrary neighbor — rather than its
/// discourse predecessor. Sorting indices instead of clones preserves
/// access to original adjacency (and avoids cloning unselected chunks).
pub fn select_chunks_within_budget(
    chunks: &[SemanticChunk],
    target_tokens: usize,
) -> Vec<SemanticChunk> {
    // Visit chunks in descending importance; stable sort keeps original
    // relative order for equal scores, matching the previous behavior.
    let mut order: Vec<usize> = (0..chunks.len()).collect();
    order.sort_by(|&a, &b| {
        chunks[b]
            .importance
            .partial_cmp(&chunks[a].importance)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let mut selected_indices = std::collections::HashSet::new();
    let mut picked: Vec<usize> = Vec::new();
    let mut total_tokens = 0;
    for &idx in &order {
        if selected_indices.contains(&idx) {
            continue;
        }
        let chunk = &chunks[idx];
        let chunk_tokens = estimate_tokens(&chunk.content);
        let needs_context = matches!(
            chunk.discourse_relation,
            DiscourseRelation::Contrast | DiscourseRelation::Cause
        );
        // Try to admit the original-order predecessor together with this
        // chunk so the relation keeps its antecedent.
        if needs_context && idx > 0 && !selected_indices.contains(&(idx - 1)) {
            let prev_tokens = estimate_tokens(&chunks[idx - 1].content);
            if total_tokens + chunk_tokens + prev_tokens <= target_tokens {
                selected_indices.insert(idx - 1);
                picked.push(idx - 1);
                total_tokens += prev_tokens;
                selected_indices.insert(idx);
                picked.push(idx);
                total_tokens += chunk_tokens;
                continue;
            }
            // Pair doesn't fit together; fall through to try the chunk alone.
        }
        if total_tokens + chunk_tokens <= target_tokens {
            selected_indices.insert(idx);
            picked.push(idx);
            total_tokens += chunk_tokens;
        }
    }
    let selected: Vec<SemanticChunk> = picked.iter().map(|&i| chunks[i].clone()).collect();
    compress_sequences(&selected)
}
/// Collapses each consecutive run of Sequence-related chunks down to its
/// final step; non-sequence chunks pass through unchanged, in order.
fn compress_sequences(chunks: &[SemanticChunk]) -> Vec<SemanticChunk> {
    let mut compressed = Vec::new();
    // Latch holding only the most recent step of the current sequence run.
    let mut pending_step: Option<SemanticChunk> = None;
    for chunk in chunks {
        if matches!(chunk.discourse_relation, DiscourseRelation::Sequence) {
            // Overwrite: earlier steps of the run are dropped.
            pending_step = Some(chunk.clone());
        } else {
            // The run ended — emit its last step before this chunk.
            if let Some(last_step) = pending_step.take() {
                compressed.push(last_step);
            }
            compressed.push(chunk.clone());
        }
    }
    // A run that reaches the end of the input still emits its last step.
    if let Some(last_step) = pending_step {
        compressed.push(last_step);
    }
    compressed
}
// Unit tests for chunk classification, discourse detection, importance
// scoring, and budget-constrained selection.
#[cfg(test)]
mod tests {
use super::*;
// Builds a minimal Message with the given role/content and a current
// timestamp; all optional fields are None.
fn create_test_message(role: &str, content: &str) -> Message {
Message {
role: role.to_string(),
content: content.to_string(),
timestamp: std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs(),
cached: false,
tool_call_id: None,
name: None,
tool_calls: None,
images: None,
videos: None,
thinking: None,
id: None,
}
}
// Error text and decision language must classify as Critical.
#[test]
fn test_classify_critical() {
let msg = create_test_message("assistant", "Error: file not found");
assert_eq!(
classify_chunk("Error: file not found", &msg),
ChunkType::Critical
);
let msg = create_test_message("user", "We decided to use Rust");
assert_eq!(
classify_chunk("We decided to use Rust", &msg),
ChunkType::Critical
);
}
// Plan/task progress markers (task completed, N/M counters, step/phase
// completion) must classify as Critical so progress state is retained.
#[test]
fn test_classify_plan_progress_as_critical() {
let msg = create_test_message("assistant", "Task completed: extend chunk classification");
assert_eq!(
classify_chunk("Task completed: extend chunk classification", &msg),
ChunkType::Critical
);
let msg = create_test_message("assistant", "completed: semantic chunking fix");
assert_eq!(
classify_chunk("completed: semantic chunking fix", &msg),
ChunkType::Critical
);
let msg = create_test_message("assistant", "NEXT TASK (2/7): Add video support");
assert_eq!(
classify_chunk("NEXT TASK (2/7): Add video support", &msg),
ChunkType::Critical
);
let msg = create_test_message("assistant", "Working on task 3/7");
assert_eq!(
classify_chunk("Working on task 3/7", &msg),
ChunkType::Critical
);
let msg = create_test_message("assistant", "Step completed: analysis");
assert_eq!(
classify_chunk("Step completed: analysis", &msg),
ChunkType::Critical
);
let msg = create_test_message("assistant", "Phase completed: planning");
assert_eq!(
classify_chunk("Phase completed: planning", &msg),
ChunkType::Critical
);
}
// File paths and URLs classify as Reference.
#[test]
fn test_classify_reference() {
let msg = create_test_message("assistant", "Check src/main.rs");
assert_eq!(
classify_chunk("Check src/main.rs", &msg),
ChunkType::Reference
);
let msg = create_test_message("assistant", "Visit https://example.com");
assert_eq!(
classify_chunk("Visit https://example.com", &msg),
ChunkType::Reference
);
}
// Short acknowledgements classify as Conversational (lowest priority).
#[test]
fn test_classify_conversational() {
let msg = create_test_message("user", "ok");
assert_eq!(classify_chunk("ok", &msg), ChunkType::Conversational);
let msg = create_test_message("user", "thanks!");
assert_eq!(classify_chunk("thanks!", &msg), ChunkType::Conversational);
}
// A fresh Critical chunk from a user message should exceed the base 10.0
// (role bonus plus freshness multiplier).
#[test]
fn test_importance_scoring() {
let msg = create_test_message("user", "This is important");
let score = calculate_importance(
"This is important",
&ChunkType::Critical,
&DiscourseRelation::None,
&msg,
);
assert!(score > 10.0); }
// Surface markers map to the expected discourse relations.
#[test]
fn test_discourse_relations() {
assert_eq!(
detect_discourse_relation("However, we should consider alternatives"),
DiscourseRelation::Contrast
);
assert_eq!(
detect_discourse_relation("Because of this, we need to refactor"),
DiscourseRelation::Cause
);
assert_eq!(
detect_discourse_relation("For example, we can use Rust"),
DiscourseRelation::Elaboration
);
assert_eq!(
detect_discourse_relation("First, we need to setup the environment"),
DiscourseRelation::Sequence
);
}
// After 48 hours, a Critical chunk (slow decay) must still outscore a
// Conversational chunk (fast decay) from the same message.
#[test]
fn test_temporal_decay_by_type() {
let old_timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs()
- (48 * 3600);
let old_msg = Message {
role: "user".to_string(),
content: "Old content".to_string(),
timestamp: old_timestamp,
cached: false,
tool_call_id: None,
name: None,
tool_calls: None,
images: None,
videos: None,
thinking: None,
id: None,
};
let critical_score = calculate_importance(
"Important decision",
&ChunkType::Critical,
&DiscourseRelation::None,
&old_msg,
);
let conversational_score = calculate_importance(
"ok",
&ChunkType::Conversational,
&DiscourseRelation::None,
&old_msg,
);
assert!(critical_score > conversational_score);
}
// With a tight budget, the highest-importance chunk is selected first.
#[test]
fn test_chunk_selection() {
let chunks = vec![
SemanticChunk {
content: "Critical info".to_string(),
importance: 10.0,
chunk_type: ChunkType::Critical,
discourse_relation: DiscourseRelation::None,
},
SemanticChunk {
content: "Context info".to_string(),
importance: 5.0,
chunk_type: ChunkType::Context,
discourse_relation: DiscourseRelation::None,
},
];
let selected = select_chunks_within_budget(&chunks, 5);
assert!(!selected.is_empty());
assert_eq!(selected[0].importance, 10.0);
}
// A Contrast chunk must bring along its antecedent so the pair stays
// coherent after selection.
#[test]
fn test_relation_aware_selection_preserves_pairs() {
let chunks = vec![
SemanticChunk {
content: "We tried approach A".to_string(),
importance: 5.0,
chunk_type: ChunkType::Context,
discourse_relation: DiscourseRelation::None,
},
SemanticChunk {
content: "However, approach B is better".to_string(),
importance: 8.0, chunk_type: ChunkType::Context,
discourse_relation: DiscourseRelation::Contrast,
},
SemanticChunk {
content: "Unrelated info".to_string(),
importance: 3.0,
chunk_type: ChunkType::Context,
discourse_relation: DiscourseRelation::None,
},
];
let selected = select_chunks_within_budget(&chunks, 100);
let has_approach_a = selected.iter().any(|c| c.content.contains("approach A"));
let has_approach_b = selected.iter().any(|c| c.content.contains("approach B"));
assert!(
has_approach_a && has_approach_b,
"Contrast pair should be preserved together"
);
}
// A run of Sequence chunks is compressed to its final step only;
// non-sequence chunks are untouched.
#[test]
fn test_sequence_compression() {
let chunks = vec![
SemanticChunk {
content: "First, we setup the environment".to_string(),
importance: 7.0,
chunk_type: ChunkType::Context,
discourse_relation: DiscourseRelation::Sequence,
},
SemanticChunk {
content: "Then, we installed dependencies".to_string(),
importance: 7.0,
chunk_type: ChunkType::Context,
discourse_relation: DiscourseRelation::Sequence,
},
SemanticChunk {
content: "Finally, we ran the tests".to_string(),
importance: 7.0,
chunk_type: ChunkType::Context,
discourse_relation: DiscourseRelation::Sequence,
},
SemanticChunk {
content: "Critical result: all tests passed".to_string(),
importance: 10.0,
chunk_type: ChunkType::Critical,
discourse_relation: DiscourseRelation::None,
},
];
let selected = select_chunks_within_budget(&chunks, 200);
let sequence_chunks: Vec<_> = selected
.iter()
.filter(|c| matches!(c.discourse_relation, DiscourseRelation::Sequence))
.collect();
assert_eq!(
sequence_chunks.len(),
1,
"Sequences should be compressed to last step only"
);
assert!(
sequence_chunks[0].content.contains("Finally"),
"Should keep the final step"
);
}
// Cause and Contrast relations add a positive importance adjustment.
#[test]
fn test_discourse_relation_importance_boost() {
let msg = create_test_message("user", "Test");
let cause_score = calculate_importance(
"Because of this issue",
&ChunkType::Context,
&DiscourseRelation::Cause,
&msg,
);
let contrast_score = calculate_importance(
"However, we can try this",
&ChunkType::Context,
&DiscourseRelation::Contrast,
&msg,
);
let none_score = calculate_importance(
"Some context",
&ChunkType::Context,
&DiscourseRelation::None,
&msg,
);
assert!(
cause_score > none_score,
"Cause relation should boost importance"
);
assert!(
contrast_score > none_score,
"Contrast relation should boost importance"
);
}
// Elaboration and Background relations subtract from the score.
#[test]
fn test_elaboration_importance_penalty() {
let msg = create_test_message("user", "Test");
let elaboration_score = calculate_importance(
"For example, we can use this",
&ChunkType::Context,
&DiscourseRelation::Elaboration,
&msg,
);
let background_score = calculate_importance(
"Background: this was done before",
&ChunkType::Context,
&DiscourseRelation::Background,
&msg,
);
let none_score = calculate_importance(
"Some context",
&ChunkType::Context,
&DiscourseRelation::None,
&msg,
);
assert!(
elaboration_score < none_score,
"Elaboration relation should reduce importance"
);
assert!(
background_score < none_score,
"Background relation should reduce importance"
);
}
}