pub(crate) use super::produce_split::split_by_token_budget;
use super::types::{approx_token_count, chunk_id, Chunk, Metadata, SourceKind};
/// Token budget per chunk used when callers rely on `ChunkerOptions::default()`.
pub const DEFAULT_CHUNK_MAX_TOKENS: u32 = 3_000;
/// Options controlling how `chunk_markdown` sizes its output chunks.
#[derive(Clone, Debug)]
pub struct ChunkerOptions {
/// Token budget for a single chunk. `chunk_markdown` clamps this to at
/// least 1 and, when packing chat/email units, converts it to a character
/// budget of four characters per token.
pub max_tokens: u32,
}
impl Default for ChunkerOptions {
    /// Builds options whose token budget is `DEFAULT_CHUNK_MAX_TOKENS`.
    fn default() -> Self {
        let max_tokens = DEFAULT_CHUNK_MAX_TOKENS;
        Self { max_tokens }
    }
}
/// One source item (its text plus identifying data) handed to `chunk_markdown`.
#[derive(Clone, Debug)]
pub struct ChunkerInput {
/// Selects the splitting strategy: chat and email sources are split into
/// message units first, documents are split directly by token budget.
pub source_kind: SourceKind,
/// Identifier of the source; passed to `chunk_id` for every chunk produced.
pub source_id: String,
/// The markdown text to be chunked.
pub markdown: String,
/// Metadata cloned onto every chunk produced from this input.
pub metadata: Metadata,
}
/// Splits `input.markdown` into [`Chunk`]s according to `input.source_kind`.
///
/// * `Document` sources are handed straight to `split_by_token_budget`; every
///   returned piece becomes one chunk, numbered in order.
/// * `Chat` and `Email` sources are first split into message units. Units are
///   packed greedily into a chunk, joined by a blank line, for as long as the
///   joined text stays within `max_tokens * 4` characters. A unit that alone
///   exceeds that character budget is split with `split_by_token_budget` and
///   each of its pieces is emitted with `partial_message` set to `true`.
///
/// `opts.max_tokens` is clamped to at least 1. All chunks share the same
/// `created_at` timestamp, taken once at the start of the call, and a clone of
/// `input.metadata`. For chat/email sources that yield no chunks at all, a
/// single empty chunk with sequence number 0 is returned.
pub fn chunk_markdown(input: &ChunkerInput, opts: &ChunkerOptions) -> Vec<Chunk> {
    // Separator placed between message units packed into the same chunk.
    const UNIT_SEPARATOR: &str = "\n\n";

    let now = chrono::Utc::now();
    let max_tokens = opts.max_tokens.max(1);
    let max_chars = (max_tokens as usize).saturating_mul(4);

    // Single place where a chunk is assembled from its text and position.
    let build = |seq: u32, content: String, partial_message: bool| {
        let token_count = approx_token_count(&content);
        let id = chunk_id(input.source_kind, &input.source_id, seq, &content);
        Chunk {
            id,
            content,
            metadata: input.metadata.clone(),
            token_count,
            seq_in_source: seq,
            created_at: now,
            partial_message,
        }
    };

    let units: Vec<String> = match input.source_kind {
        // Documents have no message structure: one chunk per budgeted piece.
        SourceKind::Document => {
            return split_by_token_budget(&input.markdown, max_tokens)
                .into_iter()
                .enumerate()
                .map(|(idx, content)| build(idx as u32, content, false))
                .collect();
        }
        SourceKind::Chat => split_chat_messages(&input.markdown),
        SourceKind::Email => split_email_messages(&input.markdown),
    };

    let sep_chars = UNIT_SEPARATOR.chars().count();
    let mut chunks: Vec<Chunk> = Vec::new();
    let mut pending: Vec<String> = Vec::new();
    let mut pending_chars = 0usize;

    // Emits the units gathered so far as one chunk and resets the gatherer.
    let flush = |pending: &mut Vec<String>, pending_chars: &mut usize, chunks: &mut Vec<Chunk>| {
        if !pending.is_empty() {
            let seq = chunks.len() as u32;
            chunks.push(build(seq, pending.join(UNIT_SEPARATOR), false));
            pending.clear();
            *pending_chars = 0;
        }
    };

    for unit in units {
        let unit_chars = unit.chars().count();

        if unit_chars > max_chars {
            // An oversize message cannot share a chunk: close the current one,
            // then emit the message as a run of partial pieces.
            flush(&mut pending, &mut pending_chars, &mut chunks);
            for piece in split_by_token_budget(&unit, max_tokens) {
                let seq = chunks.len() as u32;
                chunks.push(build(seq, piece, true));
            }
            continue;
        }

        let projected = match pending.is_empty() {
            true => unit_chars,
            false => pending_chars + sep_chars + unit_chars,
        };
        if projected > max_chars {
            flush(&mut pending, &mut pending_chars, &mut chunks);
        }

        // After a flush the gatherer is empty, so no separator is counted.
        pending_chars = if pending.is_empty() {
            unit_chars
        } else {
            pending_chars + sep_chars + unit_chars
        };
        pending.push(unit);
    }
    flush(&mut pending, &mut pending_chars, &mut chunks);

    if chunks.is_empty() {
        // Nothing was produced: return one empty placeholder chunk.
        let id = chunk_id(input.source_kind, &input.source_id, 0, "");
        chunks.push(Chunk {
            id,
            content: String::new(),
            metadata: input.metadata.clone(),
            token_count: 0,
            seq_in_source: 0,
            created_at: now,
            partial_message: false,
        });
    }
    chunks
}
/// Splits chat markdown into one string per message.
///
/// A message starts at any line beginning with `"## "` and runs up to (not
/// including) the next such line or the end of the text; trailing whitespace
/// is trimmed from each message. Text before the first header line is not
/// included in any message. If the text has no header line at all, the whole
/// text (right-trimmed) is returned as a single unit, or nothing if it is
/// blank.
fn split_chat_messages(md: &str) -> Vec<String> {
    // Byte offsets at which a header line begins.
    let mut header_offsets: Vec<usize> = Vec::new();
    let mut offset = 0usize;
    for line in md.split_inclusive('\n') {
        if line.starts_with("## ") {
            header_offsets.push(offset);
        }
        offset += line.len();
    }

    // Each message ends where the next header begins; the last ends at EOF.
    let ends = header_offsets
        .iter()
        .copied()
        .skip(1)
        .chain(std::iter::once(md.len()));

    let mut messages: Vec<String> = header_offsets
        .iter()
        .copied()
        .zip(ends)
        .map(|(start, end)| md[start..end].trim_end())
        .filter(|message| !message.is_empty())
        .map(str::to_owned)
        .collect();

    if messages.is_empty() && !md.trim().is_empty() {
        messages.push(md.trim_end().to_owned());
    }
    messages
}
/// Splits email-thread markdown into one string per message.
///
/// A message boundary is a line that equals `---` (ignoring trailing
/// whitespace) whose first non-blank following line, looked for within the
/// next eight lines, starts with `From:`. Each message runs from its boundary
/// line up to the next boundary or the end of the text, right-trimmed. Text
/// before the first boundary is not included in any message. If no boundary
/// is found, the whole text (right-trimmed) is returned as a single unit, or
/// nothing if it is blank.
fn split_email_messages(md: &str) -> Vec<String> {
    let lines: Vec<&str> = md.split('\n').collect();

    // True when line `idx` is a `---` rule introducing a `From:` header.
    let starts_message = |idx: usize| -> bool {
        lines[idx].trim_end() == "---"
            && lines
                .iter()
                .skip(idx + 1)
                .take(8)
                .find(|candidate| !candidate.trim().is_empty())
                .map_or(false, |first| first.starts_with("From:"))
    };

    let boundaries: Vec<usize> = (0..lines.len())
        .filter(|&idx| starts_message(idx))
        .collect();

    if boundaries.is_empty() {
        let whole = md.trim_end();
        return if whole.is_empty() {
            Vec::new()
        } else {
            vec![whole.to_owned()]
        };
    }

    // Each message ends where the next boundary begins; the last ends at EOF.
    let ends = boundaries
        .iter()
        .copied()
        .skip(1)
        .chain(std::iter::once(lines.len()));

    boundaries
        .iter()
        .copied()
        .zip(ends)
        .map(|(start, end)| lines[start..end].join("\n"))
        .map(|joined| joined.trim_end().to_owned())
        .filter(|message| !message.is_empty())
        .collect()
}
// Unit tests are kept in the sibling file `produce_tests.rs` and are only
// compiled for test builds.
#[cfg(test)]
#[path = "produce_tests.rs"]
mod tests;