/// One chunk of session text produced by `chunk_session`.
///
/// Derives `PartialEq`/`Eq` so chunks can be compared directly, e.g. with
/// `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SessionChunk {
    /// The chunk text: the optional `[Session from …]` header line followed by
    /// `role: content` lines (or one piece of a single oversized turn).
    pub text: String,
    /// Number of turns appended to this chunk. Overlap text carried over from
    /// the previous chunk is not counted; each piece of a split oversized turn
    /// is reported as `1`.
    pub turn_count: usize,
}
/// Groups conversation turns into text chunks of roughly `target_chars` bytes.
///
/// Every kept turn is rendered as `"{role}: {content}\n"`.
///
/// * Turns whose trimmed content is shorter than `min_turn_chars` are skipped.
/// * Rendered turns accumulate until adding the next one would push the
///   pending text past `target_chars`. The pending text is then emitted as a
///   chunk and a tail of at most 100 bytes is carried over as overlap at the
///   start of the next chunk. The overlap does not count toward `turn_count`.
/// * A rendered turn longer than `2 * target_chars` is handled on its own: the
///   pending text is flushed (without overlap) and the turn is cut into pieces
///   of at most `target_chars` bytes, preferring to cut after a newline, then
///   after a space. Pieces whose trimmed length is below `min_turn_chars` are
///   dropped; every emitted piece has `turn_count == 1`.
/// * When `session_date` is non-empty, each chunk is prefixed with
///   `"[Session from {session_date}]\n"`. The prefix is not counted toward
///   any size limit.
///
/// All sizes are byte lengths (`str::len`), not character counts. Cuts are
/// always placed on UTF-8 character boundaries, and each piece of an oversized
/// turn contains at least one character, so the function terminates even when
/// `target_chars` is `0` or smaller than a single character.
pub fn chunk_session<'a>(
    turns: impl Iterator<Item = (&'a str, &'a str)>,
    session_date: &str,
    target_chars: usize,
    min_turn_chars: usize,
) -> Vec<SessionChunk> {
    let mut chunks = Vec::new();
    let mut current_text = String::new();
    let mut current_turns = 0usize;
    let date_prefix = if session_date.is_empty() {
        String::new()
    } else {
        format!("[Session from {session_date}]\n")
    };
    // Saturate so an enormous `target_chars` cannot overflow the doubling.
    let oversized_threshold = target_chars.saturating_mul(2);

    for (role, content) in turns {
        if content.trim().len() < min_turn_chars {
            continue;
        }
        let line = format!("{role}: {content}\n");

        if line.len() > oversized_threshold {
            // Oversized turn: flush whatever is pending, then emit the turn
            // as a series of standalone pieces.
            if !current_text.is_empty() {
                chunks.push(SessionChunk {
                    text: format!("{date_prefix}{current_text}"),
                    turn_count: current_turns,
                });
                current_text.clear();
                current_turns = 0;
            }
            let mut remaining = line.as_str();
            while !remaining.is_empty() {
                let cut = oversized_split_point(remaining, target_chars);
                let (piece, rest) = remaining.split_at(cut);
                if piece.trim().len() >= min_turn_chars {
                    chunks.push(SessionChunk {
                        text: format!("{date_prefix}{piece}"),
                        turn_count: 1,
                    });
                }
                remaining = rest;
            }
            continue;
        }

        if !current_text.is_empty() && current_text.len() + line.len() > target_chars {
            chunks.push(SessionChunk {
                text: format!("{date_prefix}{current_text}"),
                turn_count: current_turns,
            });
            // Keep only the overlap tail; it seeds the next chunk but is not
            // counted as a turn.
            let keep_from = current_text.len() - overlap_tail(&current_text).len();
            current_text.drain(..keep_from);
            current_turns = 0;
        }
        current_text.push_str(&line);
        current_turns += 1;
    }

    if !current_text.is_empty() {
        chunks.push(SessionChunk {
            text: format!("{date_prefix}{current_text}"),
            turn_count: current_turns,
        });
    }
    chunks
}

/// Returns the byte offset at which to cut the next piece off `remaining`.
///
/// The result is always a character boundary in `1..=remaining.len()` for
/// non-empty input, so the caller is guaranteed to make progress.
fn oversized_split_point(remaining: &str, target_chars: usize) -> usize {
    if remaining.len() <= target_chars {
        return remaining.len();
    }
    // Back off to the nearest character boundary at or below the target;
    // index 0 is always a boundary, so this cannot underflow.
    let mut limit = target_chars;
    while !remaining.is_char_boundary(limit) {
        limit -= 1;
    }
    let window = &remaining[..limit];
    // Prefer cutting just after a newline, then just after a space.
    if let Some(pos) = window.rfind('\n').or_else(|| window.rfind(' ')) {
        return pos + 1;
    }
    if limit > 0 {
        limit
    } else {
        // The target is smaller than the first character: take that whole
        // character rather than looping forever on an empty piece.
        remaining
            .chars()
            .next()
            .map_or(remaining.len(), char::len_utf8)
    }
}

/// Returns the tail of `text` that is carried into the next chunk as overlap.
///
/// Starts from the last 100 bytes (moved forward to a character boundary) and,
/// if that window contains a space, begins just after the first one so the
/// overlap does not start in the middle of a word.
fn overlap_tail(text: &str) -> &str {
    const MAX_OVERLAP_BYTES: usize = 100;
    let mut start = text.len().saturating_sub(MAX_OVERLAP_BYTES);
    // `text.len()` is always a boundary, so this terminates within the string.
    while !text.is_char_boundary(start) {
        start += 1;
    }
    let tail = &text[start..];
    match tail.find(' ') {
        Some(space) => &tail[space + 1..],
        None => tail,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds a borrowed list of `(role, content)` pairs through `chunk_session`.
    fn run_chunker<'a>(
        turns: &[(&'a str, &'a str)],
        date: &str,
        target: usize,
        min_len: usize,
    ) -> Vec<SessionChunk> {
        chunk_session(turns.iter().copied(), date, target, min_len)
    }

    /// Short turns are dropped and every chunk carries the date header.
    #[test]
    fn basic_chunking() {
        let conversation = [
            ("User", "What is the weather like today in San Francisco?"),
            (
                "Assistant",
                "The weather in San Francisco today is partly cloudy with temperatures around 62°F.",
            ),
            ("User", "Thanks"),
            ("User", "What about tomorrow?"),
            (
                "Assistant",
                "Tomorrow is expected to be sunny with highs near 68°F and light winds.",
            ),
        ];

        let chunks = run_chunker(&conversation, "2024/01/15", 500, 10);

        let mut total_turns = 0usize;
        for chunk in &chunks {
            total_turns += chunk.turn_count;
        }
        assert_eq!(total_turns, 4, "should exclude 'Thanks' turn");
        assert!(chunks
            .iter()
            .all(|chunk| chunk.text.contains("[Session from 2024/01/15]")));
    }

    /// Many medium-sized turns are spread over several bounded chunks.
    #[test]
    fn respects_target_size() {
        let conversation = vec![
            (
                "User",
                "This is a moderately long turn that contains enough text to be meaningful for embedding purposes and search quality.",
            );
            20
        ];

        let chunks = run_chunker(&conversation, "2024/03/01", 300, 10);

        assert!(chunks.len() > 1, "should split into multiple chunks");
        for size in chunks.iter().map(|chunk| chunk.text.len()) {
            assert!(size < 600, "chunk too large: {} chars", size);
        }
    }

    /// Only the single long turn survives the minimum-length filter.
    #[test]
    fn filters_short_turns() {
        let conversation = [
            ("User", "Ok"),
            ("Assistant", "Sure"),
            ("User", "Hmm"),
            (
                "User",
                "What is the capital of France and why is it important?",
            ),
        ];

        let chunks = run_chunker(&conversation, "", 500, 10);

        assert_eq!(chunks.len(), 1);
        let only = &chunks[0];
        assert_eq!(
            only.turn_count, 1,
            "only one turn should survive filtering"
        );
        assert!(only.text.contains("capital of France"));
    }

    /// No turns in, no chunks out.
    #[test]
    fn empty_session() {
        let chunks = run_chunker(&[], "2024/01/01", 500, 10);
        assert!(chunks.is_empty());
    }

    /// An empty date string suppresses the header line.
    #[test]
    fn empty_date() {
        let conversation = [("User", "This is a test message with enough content")];

        let chunks = run_chunker(&conversation, "", 500, 10);

        assert_eq!(chunks.len(), 1);
        assert!(!chunks[0].text.contains("[Session from"));
    }
}