/// A piece of text cut out of a document, tagged with the line range it came from.
///
/// The chunkers in this file fill the line fields with zero-based line indices.
#[derive(Debug, Clone)]
pub struct TextChunk {
/// Zero-based index of the first line covered by this chunk.
pub line_start: usize,
/// Zero-based index of the last line covered by this chunk (inclusive).
pub line_end: usize,
/// The chunk's text content.
pub text: String,
}
/// Strategy for splitting a document's content into [`TextChunk`]s.
///
/// Implementors must be `Send + Sync`.
pub trait Chunker: Send + Sync {
/// Splits `content` into chunks.
///
/// `title` is the document's title; the implementations in this file use it
/// only as the text of a single fallback chunk when `content` yields no chunks.
fn chunk(&self, title: &str, content: &str) -> Vec<TextChunk>;
}
/// Trims surrounding whitespace from `s` and collapses every run of
/// consecutive `'\n'` characters into a single `'\n'`.
///
/// The returned string therefore never contains `"\n\n"`. A single
/// `replace("\n\n", "\n")` pass is not enough for that guarantee: it turns a
/// run of three newlines into two, so runs are collapsed explicitly here.
fn normalise(s: &str) -> String {
    let trimmed = s.trim();
    let mut out = String::with_capacity(trimmed.len());
    let mut prev_was_newline = false;
    for ch in trimmed.chars() {
        let is_newline = ch == '\n';
        // Keep the first newline of a run and drop the rest.
        if !(is_newline && prev_was_newline) {
            out.push(ch);
        }
        prev_was_newline = is_newline;
    }
    out
}
/// Returns how many `'\n'` characters occur in `s`.
#[inline]
fn count_newlines(s: &str) -> usize {
    s.matches('\n').count()
}
/// Chunker that cuts content into paragraphs separated by a blank line.
///
/// Only the literal two-byte sequence `"\n\n"` is treated as a paragraph
/// separator.
pub struct MarkdownChunker;

impl Chunker for MarkdownChunker {
    /// Emits one chunk per non-blank paragraph of `content`, each tagged with
    /// its zero-based, inclusive line range.
    ///
    /// When `content` holds no non-blank paragraph, a single chunk carrying
    /// `title` at line range `0..=0` is returned instead.
    fn chunk(&self, title: &str, content: &str) -> Vec<TextChunk> {
        let mut paragraphs: Vec<TextChunk> = Vec::new();
        // Line index at which the segment currently being examined begins.
        let mut segment_first_line: usize = 0;

        for segment in content.split("\n\n") {
            push_para_chunk(&mut paragraphs, segment, segment_first_line);
            // Skip the segment's own newlines plus the two consumed by the separator.
            segment_first_line += count_newlines(segment) + 2;
        }

        if !paragraphs.is_empty() {
            return paragraphs;
        }

        vec![TextChunk {
            line_start: 0,
            line_end: 0,
            text: title.to_owned(),
        }]
    }
}
/// Appends a chunk for `part` to `chunks`, unless `part` is blank.
///
/// `abs_line` is the line index at which `part` begins. Newlines inside the
/// leading whitespace of `part` push the chunk's `line_start` down; newlines
/// inside the trimmed text determine `line_end`. The stored text is `part`
/// with surrounding whitespace removed.
fn push_para_chunk(chunks: &mut Vec<TextChunk>, part: &str, abs_line: usize) {
    let without_leading = part.trim_start();
    let body = without_leading.trim_end();

    if !body.is_empty() {
        // Everything trimmed from the front; its newlines are lines to skip.
        let (skipped, _) = part.split_at(part.len() - without_leading.len());
        let first = abs_line + count_newlines(skipped);
        let last = first + count_newlines(body);

        chunks.push(TextChunk {
            line_start: first,
            line_end: last,
            text: body.to_owned(),
        });
    }
}
/// Chunker for line-oriented input where each line is expected to be a JSON value.
pub struct JsonlChunker;

impl Chunker for JsonlChunker {
    /// Emits at most one chunk per line of `content`.
    ///
    /// Blank lines are skipped but still counted, so `line_start` and
    /// `line_end` both equal the line's zero-based index in `content`. A line
    /// that parses as JSON is rendered through `extract_message_text`; a line
    /// that does not parse is kept as normalised raw text. Lines whose
    /// rendered text is empty are dropped.
    ///
    /// When no line produces a chunk, a single chunk carrying `title` at line
    /// range `0..=0` is returned instead.
    fn chunk(&self, title: &str, content: &str) -> Vec<TextChunk> {
        let records: Vec<TextChunk> = content
            .lines()
            .enumerate()
            .filter(|(_, raw)| !raw.trim().is_empty())
            .filter_map(|(idx, raw)| {
                let rendered = serde_json::from_str::<serde_json::Value>(raw)
                    .map(|parsed| extract_message_text(&parsed))
                    .unwrap_or_else(|_| normalise(raw));
                if rendered.is_empty() {
                    None
                } else {
                    Some(TextChunk {
                        line_start: idx,
                        line_end: idx,
                        text: rendered,
                    })
                }
            })
            .collect();

        if !records.is_empty() {
            return records;
        }

        vec![TextChunk {
            line_start: 0,
            line_end: 0,
            text: title.to_owned(),
        }]
    }
}
fn extract_message_text(value: &serde_json::Value) -> String {
let obj = match value.as_object() {
Some(o) => o,
None => return normalise(&value.to_string()),
};
const CONTENT_KEYS: &[&str] = &["mes", "content", "message", "text"];
let content = CONTENT_KEYS
.iter()
.find_map(|k| obj.get(*k)?.as_str())
.map(str::to_owned);
const ROLE_KEYS: &[&str] = &["name", "role", "speaker", "author"];
let role = ROLE_KEYS
.iter()
.find_map(|k| obj.get(*k)?.as_str())
.filter(|s| !s.is_empty())
.map(str::to_owned);
match (role, content) {
(Some(r), Some(c)) => normalise(&format!("{r}: {c}")),
(None, Some(c)) => normalise(&c),
_ => {
let fallback = obj
.iter()
.filter_map(|(k, v)| v.as_str().map(|s| format!("{k}: {s}")))
.collect::<Vec<_>>()
.join("\n");
if fallback.is_empty() {
normalise(&serde_json::to_string(value).unwrap_or_default())
} else {
normalise(&fallback)
}
}
}
}
/// Splits `body` on blank-line separators (`"\n\n"`) and returns each
/// non-blank paragraph, trimmed of surrounding whitespace, in document order.
///
/// Returns an empty vector when `body` contains no non-blank paragraph.
pub fn chunk_document(body: &str) -> Vec<String> {
    let mut out = Vec::new();
    for raw in body.split("\n\n") {
        let paragraph = raw.trim();
        if !paragraph.is_empty() {
            out.push(paragraph.to_owned());
        }
    }
    out
}
// Unit tests for `chunk_document`, `MarkdownChunker` and `JsonlChunker`.
#[cfg(test)]
mod tests {
use super::*;
// Two paragraphs separated by a blank line become two strings.
#[test]
fn two_paragraphs() {
let chunks = chunk_document("First.\n\nSecond.");
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0], "First.");
assert_eq!(chunks[1], "Second.");
}
// An empty body produces no chunks at all.
#[test]
fn empty_body_returns_empty() {
let chunks = chunk_document("");
assert!(chunks.is_empty());
}
// A body made only of whitespace and a separator produces no chunks.
#[test]
fn blank_only_body_returns_empty() {
let chunks = chunk_document(" \n\n ");
assert!(chunks.is_empty());
}
// A body without any separator is returned as a single chunk.
#[test]
fn single_paragraph() {
let chunks = chunk_document("Only body.");
assert_eq!(chunks, vec!["Only body."]);
}
// Single-line paragraphs: the blank separator line is counted, so the
// second paragraph sits on line 2.
#[test]
fn markdown_line_range_single_line() {
let c = MarkdownChunker;
let chunks = c.chunk("Title", "First.\n\nSecond.");
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 0);
assert_eq!(chunks[0].text, "First.");
assert_eq!(chunks[1].line_start, 2);
assert_eq!(chunks[1].line_end, 2);
assert_eq!(chunks[1].text, "Second.");
}
// A three-line paragraph spans lines 0..=2; the next paragraph starts on line 4.
#[test]
fn markdown_multiline_paragraph() {
let c = MarkdownChunker;
let chunks = c.chunk("T", "A\nB\nC\n\nD");
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 2);
assert_eq!(chunks[0].text, "A\nB\nC");
assert_eq!(chunks[1].line_start, 4);
assert_eq!(chunks[1].line_end, 4);
}
// Leading blank lines produce no chunk of their own but still shift line numbers.
#[test]
fn markdown_leading_blank_lines() {
let c = MarkdownChunker;
let chunks = c.chunk("T", "\n\nActual paragraph.");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line_start, 2);
assert_eq!(chunks[0].line_end, 2);
}
// With no content, the title is returned as the only chunk at line 0.
#[test]
fn markdown_empty_body_returns_title() {
let c = MarkdownChunker;
let chunks = c.chunk("Title", "");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 0);
assert_eq!(chunks[0].text, "Title");
}
// Each JSONL record maps to the single line it occupies, and the `mes`
// field ends up in the chunk text.
#[test]
fn jsonl_line_ranges() {
let c = JsonlChunker;
let jsonl = concat!(
"{\"name\":\"User\",\"is_user\":true,\"mes\":\"Hello there\"}\n",
"{\"name\":\"Aria\",\"is_user\":false,\"mes\":\"Hi! How can I help?\"}"
);
let chunks = c.chunk("chat.jsonl", jsonl);
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 0);
assert_eq!(chunks[1].line_start, 1);
assert_eq!(chunks[1].line_end, 1);
assert!(chunks[0].text.contains("Hello there"));
assert!(chunks[1].text.contains("Hi! How can I help?"));
}
// A single record without a trailing newline yields one chunk using the `content` field.
#[test]
fn jsonl_single_line() {
let c = JsonlChunker;
let jsonl = "{\"role\":\"user\",\"content\":\"Just one message\"}";
let chunks = c.chunk("single.jsonl", jsonl);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 0);
assert!(chunks[0].text.contains("Just one message"));
}
// Blank lines are skipped, but later records keep their original line index.
#[test]
fn jsonl_skips_blank_lines_preserves_indices() {
let c = JsonlChunker;
let jsonl = "{\"content\":\"first\"}\n\n{\"content\":\"third\"}";
let chunks = c.chunk("log.jsonl", jsonl);
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[1].line_start, 2);
}
// A line that is not valid JSON is kept as a chunk containing the raw line text.
#[test]
fn jsonl_unparseable_line_falls_back_to_raw() {
let c = JsonlChunker;
let jsonl = "{\"content\":\"ok\"}\n{not json";
let chunks = c.chunk("log.jsonl", jsonl);
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[1].line_start, 1);
assert!(chunks[1].text.contains("not json"));
}
// Appending a record leaves the line numbers and text of earlier chunks unchanged.
#[test]
fn jsonl_append_preserves_existing_line_starts() {
let c = JsonlChunker;
let before = "{\"content\":\"a\"}\n{\"content\":\"b\"}";
let after = "{\"content\":\"a\"}\n{\"content\":\"b\"}\n{\"content\":\"c\"}";
let before_chunks = c.chunk("log.jsonl", before);
let after_chunks = c.chunk("log.jsonl", after);
assert_eq!(before_chunks.len(), 2);
assert_eq!(after_chunks.len(), 3);
for (b, a) in before_chunks.iter().zip(after_chunks.iter()) {
assert_eq!(b.line_start, a.line_start);
assert_eq!(b.text, a.text);
}
}
// With no content, the title is returned as the only chunk.
#[test]
fn jsonl_empty_body_returns_title() {
let c = JsonlChunker;
let chunks = c.chunk("empty.jsonl", "");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].text, "empty.jsonl");
}
// A JSON-escaped blank line (`\n\n`) inside a message must not survive as
// two consecutive newlines in the chunk text.
#[test]
fn jsonl_text_has_no_double_newline() {
let c = JsonlChunker;
let jsonl = concat!(
"{\"role\":\"user\",\"content\":\"Line one\\n\\nLine two\"}\n",
"{\"role\":\"assistant\",\"content\":\"OK\"}"
);
let chunks = c.chunk("chat.jsonl", jsonl);
for ch in &chunks {
assert!(!ch.text.contains("\n\n"));
}
}
}