/// A piece of a document together with the range of source lines it came from.
///
/// The chunkers in this file produce zero-based line indices, and the range is
/// inclusive on both ends (a single-line chunk has `line_start == line_end`).
#[derive(Debug, Clone)]
pub struct TextChunk {
/// Index of the first source line covered by this chunk.
pub line_start: usize,
/// Index of the last source line covered by this chunk.
pub line_end: usize,
/// The text of the chunk.
pub text: String,
}
/// Splits a document into [`TextChunk`]s.
///
/// The `Send + Sync` supertraits require every implementor to be shareable
/// across threads.
pub trait Chunker: Send + Sync {
/// Breaks `content` into chunks annotated with their line ranges.
///
/// How `title` is used is up to the implementor: in this file
/// `MarkdownChunker` falls back to it when `content` yields no chunks, and
/// `JsonChunker` ignores it.
fn chunk(&self, title: &str, content: &str) -> Vec<TextChunk>;
}
/// Collapses every run of consecutive `\n` characters into a single `\n` and
/// trims leading/trailing whitespace.
///
/// The result never contains `"\n\n"`. A single non-overlapping
/// `replace("\n\n", "\n")` pass is not enough for that: a run of three or more
/// newlines would still leave a blank line behind, so runs are collapsed
/// character by character instead.
fn normalise(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for ch in s.chars() {
        // Drop a newline that directly follows another newline.
        if ch == '\n' && collapsed.ends_with('\n') {
            continue;
        }
        collapsed.push(ch);
    }
    collapsed.trim().to_owned()
}
/// Returns how many line-feed (`\n`) bytes occur in `s`.
#[inline]
fn count_newlines(s: &str) -> usize {
    s.as_bytes()
        .iter()
        .fold(0, |total, &byte| total + usize::from(byte == b'\n'))
}
/// Chunker that emits one chunk per paragraph, where paragraphs are the pieces
/// of the document separated by `"\n\n"`.
pub struct MarkdownChunker;

impl Chunker for MarkdownChunker {
    /// Splits `content` on `"\n\n"` and returns one chunk per piece that is not
    /// blank after trimming, each tagged with its zero-based line range.
    ///
    /// When no piece yields a chunk, a single chunk holding `title` at line 0
    /// is returned instead.
    fn chunk(&self, title: &str, content: &str) -> Vec<TextChunk> {
        let mut paragraphs: Vec<TextChunk> = Vec::new();
        // Zero-based line on which the current piece begins.
        let mut first_line: usize = 0;

        for piece in content.split("\n\n") {
            push_para_chunk(&mut paragraphs, piece, first_line);
            // Advance past the piece's own line breaks plus the two-newline separator.
            first_line += count_newlines(piece) + 2;
        }

        if !paragraphs.is_empty() {
            return paragraphs;
        }
        vec![TextChunk {
            line_start: 0,
            line_end: 0,
            text: title.to_owned(),
        }]
    }
}
/// Appends the trimmed text of `part` to `chunks`, unless it is blank.
///
/// `abs_line` is the zero-based line on which `part` begins. Newlines inside
/// the leading whitespace that gets trimmed away push `line_start` down, and
/// `line_end` is `line_start` plus the number of newlines left in the trimmed
/// text.
fn push_para_chunk(chunks: &mut Vec<TextChunk>, part: &str, abs_line: usize) {
    let body = part.trim();
    if !body.is_empty() {
        // Byte length of the whitespace prefix that trimming removed.
        let lead_len = part.len() - part.trim_start().len();
        let first = abs_line + count_newlines(&part[..lead_len]);
        let last = first + count_newlines(body);
        chunks.push(TextChunk {
            line_start: first,
            line_end: last,
            text: body.to_owned(),
        });
    }
}
/// Chunker for JSON Lines and JSON documents.
pub struct JsonChunker;

impl Chunker for JsonChunker {
    /// Chunks `content` by trying, in order:
    ///
    /// 1. JSON Lines (`try_parse_jsonl`),
    /// 2. a single JSON document (`chunks_from_json_value`),
    /// 3. a single chunk holding the normalised raw text when neither parses.
    ///
    /// The fallback chunk spans line 0 through the number of `\n` bytes in
    /// `content`, so a trailing newline counts towards `line_end`. The title
    /// is not used.
    fn chunk(&self, _title: &str, content: &str) -> Vec<TextChunk> {
        try_parse_jsonl(content).unwrap_or_else(|| {
            serde_json::from_str::<serde_json::Value>(content)
                .map(|document| chunks_from_json_value(content, &document))
                .unwrap_or_else(|_| {
                    vec![TextChunk {
                        line_start: 0,
                        line_end: count_newlines(content),
                        text: normalise(content),
                    }]
                })
        })
    }
}
/// Interprets `content` as JSON Lines: one JSON value per non-blank line.
///
/// Returns `None` when there are fewer than two non-blank lines or when any
/// non-blank line fails to parse as JSON. Otherwise returns one chunk per
/// non-blank line, whose `line_start` and `line_end` are both that line's
/// zero-based index in `content` (blank lines still count towards the index).
fn try_parse_jsonl(content: &str) -> Option<Vec<TextChunk>> {
    let records: Vec<(usize, &str)> = content
        .lines()
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .collect();

    // A single record is left to the whole-document JSON path.
    if records.len() < 2 {
        return None;
    }

    // Collecting into `Option<Vec<_>>` stops at the first line that is not JSON.
    records
        .into_iter()
        .map(|(line_no, line)| {
            let parsed: serde_json::Value = serde_json::from_str(line).ok()?;
            Some(TextChunk {
                line_start: line_no,
                line_end: line_no,
                text: extract_message_text(&parsed),
            })
        })
        .collect()
}
/// Turns an already-parsed JSON document into chunks.
///
/// * A top-level array yields one chunk per element.
/// * An object yields one chunk per element of the first non-empty array found
///   under `"messages"`, `"chat"`, `"history"` or `"log"` (checked in that
///   order); without such an array the whole object becomes a single chunk.
/// * Any other value becomes a single chunk of its normalised JSON text.
///
/// `raw` is the text `value` was parsed from; it is only used to work out line
/// ranges. Single-chunk results span line 0 through the newline count of `raw`.
fn chunks_from_json_value(raw: &str, value: &serde_json::Value) -> Vec<TextChunk> {
    /// Pairs element `i` with span `i`, using `(i, i)` when no span was found for it.
    fn pair_up(items: &[serde_json::Value], spans: &[(usize, usize)]) -> Vec<TextChunk> {
        items
            .iter()
            .enumerate()
            .map(|(idx, item)| {
                let (line_start, line_end) = spans.get(idx).copied().unwrap_or((idx, idx));
                TextChunk {
                    line_start,
                    line_end,
                    text: extract_message_text(item),
                }
            })
            .collect()
    }

    // One chunk covering the whole of `raw`.
    let whole_document = |text: String| {
        vec![TextChunk {
            line_start: 0,
            line_end: count_newlines(raw),
            text,
        }]
    };

    match value {
        serde_json::Value::Array(items) => {
            let spans = find_array_element_lines(raw);
            pair_up(items, &spans)
        }
        serde_json::Value::Object(fields) => {
            const ARRAY_KEYS: &[&str] = &["messages", "chat", "history", "log"];
            let conversation = ARRAY_KEYS.iter().find_map(|key| match fields.get(*key) {
                Some(serde_json::Value::Array(items)) if !items.is_empty() => Some((*key, items)),
                _ => None,
            });
            match conversation {
                Some((key, items)) => {
                    let spans = find_nested_array_lines(raw, key);
                    pair_up(items, &spans)
                }
                None => whole_document(extract_message_text(value)),
            }
        }
        scalar => whole_document(normalise(&scalar.to_string())),
    }
}
/// Line ranges of the elements of the array opened by the first `[` in `raw`.
///
/// Returns an empty vector when `raw` contains no `[`.
fn find_array_element_lines(raw: &str) -> Vec<(usize, usize)> {
    raw.find('[')
        // Scanning starts on the byte right after the opening bracket.
        .map(|open| scan_array_element_lines(raw, open + 1))
        .unwrap_or_default()
}
/// Line ranges of the elements of the array stored under `key` in `raw`.
///
/// Looks for the first occurrence of `"key"` that is followed (ignoring
/// whitespace) by `:` and then `[`, and scans the array opened by that
/// bracket. Occurrences where the quoted text is a string value, or a key
/// whose value is not an array, are skipped; otherwise the scan could start
/// inside an unrelated array that merely appears later in the document.
///
/// Returns an empty vector when no such occurrence exists.
fn find_nested_array_lines(raw: &str, key: &str) -> Vec<(usize, usize)> {
    let needle = format!("\"{key}\"");
    let array_start = raw.match_indices(needle.as_str()).find_map(|(pos, _)| {
        let after_key = raw[pos + needle.len()..].trim_start();
        let after_colon = after_key.strip_prefix(':')?.trim_start();
        // `after_colon` is a suffix of `raw`, so the length difference is the
        // byte offset of the `[`; scanning begins one byte past it.
        after_colon
            .starts_with('[')
            .then(|| raw.len() - after_colon.len() + 1)
    });
    match array_start {
        Some(start) => scan_array_element_lines(raw, start),
        None => Vec::new(),
    }
}
/// Scans the body of a JSON array and returns the zero-based, inclusive
/// `(first_line, last_line)` range of each top-level element.
///
/// `start` is the byte offset just past the array's opening `[`; values larger
/// than `raw.len()` are clamped. Line numbers are relative to the start of
/// `raw`. Scanning stops at the bracket that closes the array, or at the end
/// of input.
///
/// This is a lightweight tokeniser, not a validator: it tracks string and
/// escape state and nesting depth only.
fn scan_array_element_lines(raw: &str, start: usize) -> Vec<(usize, usize)> {
    let bytes = raw.as_bytes();
    let start = start.min(bytes.len());
    let mut ranges: Vec<(usize, usize)> = Vec::new();

    // Lines already consumed before the scan begins. Counted on the byte slice
    // so that an arbitrary `start` cannot hit a char-boundary panic.
    let mut line = bytes[..start].iter().filter(|&&b| b == b'\n').count();
    // Nesting depth relative to the array being scanned (0 = directly inside it).
    let mut depth: usize = 0;
    let mut in_string = false;
    let mut escaped = false;
    // First line of the element currently being read, if any.
    let mut current_start: Option<usize> = None;

    for &b in &bytes[start..] {
        if in_string {
            if escaped {
                // The byte after a backslash never terminates the string.
                escaped = false;
            } else if b == b'\\' {
                escaped = true;
            } else if b == b'"' {
                in_string = false;
                // A closing quote at depth 0 ends a top-level string element.
                if depth == 0 {
                    if let Some(first) = current_start.take() {
                        ranges.push((first, line));
                    }
                }
            }
            if b == b'\n' {
                line += 1;
            }
            continue;
        }

        match b {
            b'"' => {
                in_string = true;
                if depth == 0 && current_start.is_none() {
                    current_start = Some(line);
                }
            }
            b'{' | b'[' => {
                if depth == 0 && current_start.is_none() {
                    current_start = Some(line);
                }
                depth += 1;
            }
            b'}' | b']' => {
                // A closer at depth 0 is the end of the array being scanned.
                if depth == 0 {
                    break;
                }
                depth -= 1;
                if depth == 0 {
                    if let Some(first) = current_start.take() {
                        ranges.push((first, line));
                    }
                }
            }
            // At depth 0 outside a string, an open element can only be a bare
            // scalar (number / true / false / null). It ends at the first
            // separator or whitespace byte, and must be closed BEFORE a newline
            // is counted, otherwise its end line would drift onto the following
            // line (e.g. the line of the closing bracket).
            b',' | b' ' | b'\t' | b'\r' | b'\n' if depth == 0 => {
                if let Some(first) = current_start.take() {
                    ranges.push((first, line));
                }
                if b == b'\n' {
                    line += 1;
                }
            }
            b'\n' => line += 1,
            b'0'..=b'9' | b'-' | b't' | b'f' | b'n' if depth == 0 => {
                if current_start.is_none() {
                    current_start = Some(line);
                }
            }
            _ => {}
        }
    }

    // A scalar that runs straight into the closing bracket or the end of input.
    if let Some(first) = current_start.take() {
        ranges.push((first, line));
    }
    ranges
}
/// Produces the display text for one JSON value.
///
/// * Non-objects are rendered as their normalised JSON text.
/// * For objects, the body is the first string found under `"mes"`,
///   `"content"`, `"message"` or `"text"`, and the speaker is the first string
///   found under `"name"`, `"role"`, `"speaker"` or `"author"` (discarded when
///   that string is empty). With a body the result is `"speaker: body"`, or
///   just the body when there is no speaker.
/// * Objects without a body list every string field as `"key: value"`, one per
///   line; if there are none, the whole object is serialised.
///
/// Every result is passed through `normalise`.
fn extract_message_text(value: &serde_json::Value) -> String {
    const CONTENT_KEYS: &[&str] = &["mes", "content", "message", "text"];
    const ROLE_KEYS: &[&str] = &["name", "role", "speaker", "author"];

    let Some(fields) = value.as_object() else {
        return normalise(&value.to_string());
    };

    let body = CONTENT_KEYS
        .iter()
        .find_map(|key| fields.get(*key).and_then(|field| field.as_str()));
    // The emptiness filter runs after the lookup: an empty first match means
    // "no speaker" and later keys are not consulted.
    let speaker = ROLE_KEYS
        .iter()
        .find_map(|key| fields.get(*key).and_then(|field| field.as_str()))
        .filter(|name| !name.is_empty());

    if let Some(body) = body {
        return match speaker {
            Some(speaker) => normalise(&format!("{speaker}: {body}")),
            None => normalise(body),
        };
    }

    let mut listing: Vec<String> = Vec::new();
    for (key, field) in fields.iter() {
        if let Some(text) = field.as_str() {
            listing.push(format!("{key}: {text}"));
        }
    }
    let joined = listing.join("\n");
    if joined.is_empty() {
        normalise(&serde_json::to_string(value).unwrap_or_default())
    } else {
        normalise(&joined)
    }
}
/// Splits `body` into paragraphs separated by `"\n\n"`.
///
/// Each paragraph is trimmed, and paragraphs that are empty after trimming are
/// dropped, so an empty or whitespace-only `body` yields an empty vector.
pub fn chunk_document(body: &str) -> Vec<String> {
    // One lazy pass: no intermediate `Vec<&str>` and no special case for the
    // empty result, which `collect` already produces.
    body.split("\n\n")
        .map(str::trim)
        .filter(|paragraph| !paragraph.is_empty())
        .map(str::to_owned)
        .collect()
}
// Unit tests for `chunk_document`, `MarkdownChunker` and `JsonChunker`.
// All line numbers asserted below are zero-based.
#[cfg(test)]
mod tests {
use super::*;
// Two paragraphs separated by a blank line become two strings.
#[test]
fn two_paragraphs() {
let chunks = chunk_document("First.\n\nSecond.");
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0], "First.");
assert_eq!(chunks[1], "Second.");
}
// An empty body produces no chunks.
#[test]
fn empty_body_returns_empty() {
let chunks = chunk_document("");
assert!(chunks.is_empty());
}
// Paragraphs consisting only of whitespace are dropped.
#[test]
fn blank_only_body_returns_empty() {
let chunks = chunk_document(" \n\n ");
assert!(chunks.is_empty());
}
// A body without a blank line is returned as a single paragraph.
#[test]
fn single_paragraph() {
let chunks = chunk_document("Only body.");
assert_eq!(chunks, vec!["Only body."]);
}
// The blank separator line occupies index 1, so the second paragraph sits on line 2.
#[test]
fn markdown_line_range_single_line() {
let c = MarkdownChunker;
let chunks = c.chunk("Title", "First.\n\nSecond.");
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 0);
assert_eq!(chunks[0].text, "First.");
assert_eq!(chunks[1].line_start, 2);
assert_eq!(chunks[1].line_end, 2);
assert_eq!(chunks[1].text, "Second.");
}
// A three-line paragraph spans lines 0..=2; the next paragraph starts after the blank line.
#[test]
fn markdown_multiline_paragraph() {
let c = MarkdownChunker;
let chunks = c.chunk("T", "A\nB\nC\n\nD");
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 2);
assert_eq!(chunks[0].text, "A\nB\nC");
assert_eq!(chunks[1].line_start, 4);
assert_eq!(chunks[1].line_end, 4);
}
// Leading blank lines are skipped but still counted when numbering lines.
#[test]
fn markdown_leading_blank_lines() {
let c = MarkdownChunker;
let chunks = c.chunk("T", "\n\nActual paragraph.");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line_start, 2);
assert_eq!(chunks[0].line_end, 2);
}
// With no content, the title is returned as the only chunk.
#[test]
fn markdown_empty_body_returns_title() {
let c = MarkdownChunker;
let chunks = c.chunk("Title", "");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 0);
assert_eq!(chunks[0].text, "Title");
}
// JSON Lines input: each record maps to exactly the line it was read from.
#[test]
fn jsonl_line_ranges() {
let c = JsonChunker;
let jsonl = concat!(
"{\"name\":\"User\",\"is_user\":true,\"mes\":\"Hello there\"}\n",
"{\"name\":\"Aria\",\"is_user\":false,\"mes\":\"Hi! How can I help?\"}"
);
let chunks = c.chunk("chat.jsonl", jsonl);
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 0);
assert_eq!(chunks[1].line_start, 1);
assert_eq!(chunks[1].line_end, 1);
assert!(chunks[0].text.contains("Hello there"));
assert!(chunks[1].text.contains("Hi! How can I help?"));
}
// An object with a "messages" array yields one single-line chunk per message, in order.
#[test]
fn json_openai_messages() {
let c = JsonChunker;
let json = "{\n \"messages\": [\n {\"role\":\"user\",\"content\":\"What is 2+2?\"},\n {\"role\":\"assistant\",\"content\":\"4\"}\n ]\n}";
let chunks = c.chunk("session.json", json);
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line_start, chunks[0].line_end);
assert_eq!(chunks[1].line_start, chunks[1].line_end);
assert!(chunks[0].line_start >= 2);
assert!(chunks[1].line_start > chunks[0].line_start);
}
// A top-level array yields one chunk per element, located after the opening bracket's line.
#[test]
fn json_array() {
let c = JsonChunker;
let json = "[\n {\"role\":\"user\",\"content\":\"Ping\"},\n {\"role\":\"assistant\",\"content\":\"Pong\"}\n]";
let chunks = c.chunk("msgs.json", json);
assert_eq!(chunks.len(), 2);
assert!(chunks[0].text.contains("Ping"));
assert!(chunks[1].text.contains("Pong"));
assert!(chunks[0].line_start >= 1);
assert_eq!(chunks[0].line_start, chunks[0].line_end);
}
// A pretty-printed element that spans several lines reports a multi-line range.
#[test]
fn json_array_multiline_element() {
let c = JsonChunker;
let json = "[\n {\n \"role\": \"user\",\n \"content\": \"Hello\"\n }\n]";
let chunks = c.chunk("msgs.json", json);
assert_eq!(chunks.len(), 1);
assert!(chunks[0].line_start >= 1);
assert!(
chunks[0].line_end > chunks[0].line_start,
"expected multi-line range, got {}..={}",
chunks[0].line_start,
chunks[0].line_end
);
}
// A lone object (a single line, so not JSON Lines) becomes one chunk covering line 0.
#[test]
fn json_single_object() {
let c = JsonChunker;
let json = "{\"role\":\"user\",\"content\":\"Just one message\"}";
let chunks = c.chunk("single.json", json);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line_start, 0);
assert_eq!(chunks[0].line_end, 0);
assert!(chunks[0].text.contains("Just one message"));
}
// Blank lines inside message content must not survive into the chunk text.
#[test]
fn text_has_no_double_newline() {
let c = JsonChunker;
let jsonl = concat!(
"{\"role\":\"user\",\"content\":\"Line one\\n\\nLine two\"}\n",
"{\"role\":\"assistant\",\"content\":\"OK\"}"
);
let chunks = c.chunk("chat.jsonl", jsonl);
for ch in &chunks {
assert!(!ch.text.contains("\n\n"));
}
}
}