/// A piece of text extracted from a document, tagged with the position at
/// which it starts in the original source.
#[derive(Debug, Clone)]
pub struct TextChunk {
/// Zero-based index of the source line on which the chunk starts.
pub line: usize,
/// Offset of the chunk start within `line`. The Markdown and JSONL chunkers
/// in this file always report 0; the JSON array scanner reports a byte offset.
pub column: usize,
/// The chunk's text content.
pub text: String,
}
/// Strategy for splitting a document into positioned [`TextChunk`]s.
///
/// Implementations must be `Send + Sync`.
pub trait Chunker: Send + Sync {
/// Splits `content` into chunks.
///
/// `title` is the document title; how it is used is up to the implementation
/// (in this file `MarkdownChunker` falls back to it when the body yields no
/// chunks, while `JsonChunker` ignores it).
fn chunk(&self, title: &str, content: &str) -> Vec<TextChunk>;
}
/// Produces chunk-safe text: surrounding whitespace is trimmed and every run
/// of consecutive `'\n'` characters is collapsed to a single `'\n'`.
///
/// The result never contains `"\n\n"`, which is the paragraph separator used
/// elsewhere in this file. A single-pass `replace("\n\n", "\n")` is not enough
/// for that guarantee: three or more newlines in a row would still leave a
/// doubled newline behind, so runs are collapsed explicitly.
fn normalise(s: &str) -> String {
    let trimmed = s.trim();
    let mut out = String::with_capacity(trimmed.len());
    let mut prev_was_newline = false;
    for ch in trimmed.chars() {
        let is_newline = ch == '\n';
        // Keep only the first newline of each run.
        if !(is_newline && prev_was_newline) {
            out.push(ch);
        }
        prev_was_newline = is_newline;
    }
    out
}
/// Returns how many `'\n'` characters occur in `s`.
#[inline]
fn count_newlines(s: &str) -> usize {
    s.matches('\n').count()
}
/// Chunker that treats every blank-line-separated paragraph as one chunk.
pub struct MarkdownChunker;

impl Chunker for MarkdownChunker {
    /// Splits `content` on `"\n\n"` and emits one chunk per non-blank
    /// paragraph, recording the zero-based line on which each one starts.
    ///
    /// When no paragraph survives (empty or whitespace-only content), a
    /// single chunk holding `title` at line 0 is returned instead.
    fn chunk(&self, title: &str, content: &str) -> Vec<TextChunk> {
        let mut paragraphs: Vec<TextChunk> = Vec::new();
        // Line index at which the current segment begins.
        let mut segment_line = 0usize;

        for segment in content.split("\n\n") {
            push_para_chunk(&mut paragraphs, segment, segment_line);
            // Skip the segment's own line breaks plus the two-newline separator.
            segment_line += count_newlines(segment) + 2;
        }

        if paragraphs.is_empty() {
            paragraphs.push(TextChunk {
                line: 0,
                column: 0,
                text: title.to_owned(),
            });
        }
        paragraphs
    }
}
/// Appends `part` to `chunks` as a trimmed paragraph, unless it is blank.
///
/// `abs_line` is the line on which `part` begins; any newlines inside the
/// leading whitespace that trimming removes are added to it so the recorded
/// line points at the first visible character.
fn push_para_chunk(chunks: &mut Vec<TextChunk>, part: &str, abs_line: usize) {
    let body = part.trim();
    if !body.is_empty() {
        // Length of the whitespace prefix stripped from the front of `part`.
        let skipped_len = part.len() - part.trim_start().len();
        let skipped_lines = count_newlines(&part[..skipped_len]);
        chunks.push(TextChunk {
            line: abs_line + skipped_lines,
            column: 0,
            text: body.to_owned(),
        });
    }
}
/// Chunker for JSON and JSON Lines documents.
pub struct JsonChunker;

impl Chunker for JsonChunker {
    /// Chunks `content` by trying, in order:
    ///
    /// 1. JSON Lines (one JSON value per non-empty line),
    /// 2. a single JSON document,
    /// 3. a plain-text fallback: the whole normalised content as one chunk
    ///    at line 0, column 0.
    ///
    /// The title is not used.
    fn chunk(&self, _title: &str, content: &str) -> Vec<TextChunk> {
        try_parse_jsonl(content).unwrap_or_else(|| {
            serde_json::from_str::<serde_json::Value>(content).map_or_else(
                |_parse_error| {
                    vec![TextChunk {
                        line: 0,
                        column: 0,
                        text: normalise(content),
                    }]
                },
                |document| chunks_from_json_value(content, &document),
            )
        })
    }
}
/// Interprets `content` as JSON Lines.
///
/// Returns `Some` with one chunk per non-empty line (at that line's
/// zero-based index, column 0) only when there are at least two non-empty
/// lines and every one of them parses as a JSON value; otherwise `None`.
fn try_parse_jsonl(content: &str) -> Option<Vec<TextChunk>> {
    let records = || {
        content
            .lines()
            .enumerate()
            .filter(|(_, text)| !text.trim().is_empty())
    };

    // A lone record is handled as an ordinary JSON document by the caller,
    // so bail out before parsing anything.
    records().nth(1)?;

    records()
        .map(|(line, text)| {
            let value = serde_json::from_str::<serde_json::Value>(text).ok()?;
            Some(TextChunk {
                line,
                column: 0,
                text: extract_message_text(&value),
            })
        })
        .collect()
}
/// Converts a parsed JSON document into chunks, using `raw` (the text that
/// `value` was parsed from) to recover source positions.
///
/// * Top-level array: one chunk per element.
/// * Object: if one of the keys `messages`, `chat`, `history`, `log`
///   (checked in that order) holds a non-empty array, one chunk per element
///   of that array; otherwise a single chunk for the whole object.
/// * Any other value: a single chunk with its normalised JSON rendering.
///
/// An element without a scanned position falls back to `(index, 0)`.
fn chunks_from_json_value(raw: &str, value: &serde_json::Value) -> Vec<TextChunk> {
    /// Pairs each array element with its scanned position.
    fn positioned(items: &[serde_json::Value], positions: &[(usize, usize)]) -> Vec<TextChunk> {
        let mut out = Vec::with_capacity(items.len());
        for (index, item) in items.iter().enumerate() {
            let (line, column) = positions.get(index).copied().unwrap_or((index, 0));
            out.push(TextChunk {
                line,
                column,
                text: extract_message_text(item),
            });
        }
        out
    }

    /// Wraps `text` as the only chunk, located at the start of the document.
    fn whole_document(text: String) -> Vec<TextChunk> {
        vec![TextChunk {
            line: 0,
            column: 0,
            text,
        }]
    }

    const ARRAY_KEYS: [&str; 4] = ["messages", "chat", "history", "log"];

    match value {
        serde_json::Value::Array(items) => positioned(items, &find_array_element_positions(raw)),
        serde_json::Value::Object(fields) => {
            for key in ARRAY_KEYS {
                match fields.get(key) {
                    Some(serde_json::Value::Array(items)) if !items.is_empty() => {
                        return positioned(items, &find_nested_array_positions(raw, key));
                    }
                    _ => {}
                }
            }
            whole_document(extract_message_text(value))
        }
        scalar => whole_document(normalise(&scalar.to_string())),
    }
}
/// Locates the elements of the array opened by the first `[` in `raw`.
///
/// Returns an empty list when `raw` contains no `[` at all.
fn find_array_element_positions(raw: &str) -> Vec<(usize, usize)> {
    raw.find('[').map_or_else(Vec::new, |open_bracket| {
        scan_array_element_positions(raw, open_bracket + 1)
    })
}
/// Locates the elements of the array stored under `key` inside `raw`.
///
/// This is a textual search, not a structural one: it takes the first
/// occurrence of `"key"` (with quotes) in `raw` and then the first `[` after
/// it. Returns an empty list when either is missing.
fn find_nested_array_positions(raw: &str, key: &str) -> Vec<(usize, usize)> {
    let quoted_key = format!("\"{key}\"");

    let Some(key_at) = raw.find(&quoted_key) else {
        return Vec::new();
    };
    let search_from = key_at + quoted_key.len();

    let Some(bracket_rel) = raw[search_from..].find('[') else {
        return Vec::new();
    };

    // Scanning starts on the byte right after the opening bracket.
    scan_array_element_positions(raw, search_from + bracket_rel + 1)
}
/// Scans the body of a JSON array and records where each top-level element
/// starts.
///
/// `start` is the byte offset just past the array's opening `[`; values
/// beyond the end of `raw` are clamped. Scanning stops at the bracket that
/// closes the array (or at the end of input).
///
/// Returns one `(line, column)` pair per element in source order. `line` is
/// the zero-based line index within `raw`; `column` is the byte offset from
/// the start of that line.
///
/// Exactly one position is recorded per element: strings at their opening
/// quote, objects/arrays at their opening bracket, and scalars (numbers,
/// `true`, `false`, `null`) at their first byte only, so multi-character
/// numbers such as `10` or `-3.5e-2` do not produce extra entries.
fn scan_array_element_positions(raw: &str, start: usize) -> Vec<(usize, usize)> {
    let bytes = raw.as_bytes();
    let begin = start.min(bytes.len());

    // Derive the starting line/column from everything before `begin`.
    // Working on bytes avoids any char-boundary requirement on `start`.
    let prefix = &bytes[..begin];
    let mut line = prefix.iter().filter(|&&b| b == b'\n').count();
    let line_start = prefix
        .iter()
        .rposition(|&b| b == b'\n')
        .map_or(0, |newline_at| newline_at + 1);
    let mut col = begin - line_start;

    let mut positions = Vec::new();
    let mut depth = 0usize; // nesting level relative to the scanned array
    let mut in_string = false;
    let mut escaped = false; // previous byte was a backslash inside a string
    let mut in_scalar = false; // inside a number/true/false/null at depth 0

    for &b in &bytes[begin..] {
        if escaped {
            // The byte after a backslash is always part of the string.
            escaped = false;
        } else if in_string {
            match b {
                b'\\' => escaped = true,
                b'"' => in_string = false,
                _ => {}
            }
        } else {
            match b {
                b'"' => {
                    in_string = true;
                    in_scalar = false;
                    if depth == 0 {
                        positions.push((line, col));
                    }
                }
                b'{' | b'[' => {
                    in_scalar = false;
                    if depth == 0 {
                        positions.push((line, col));
                    }
                    depth += 1;
                }
                b'}' | b']' => {
                    in_scalar = false;
                    match depth.checked_sub(1) {
                        Some(outer) => depth = outer,
                        // This bracket closes the array being scanned.
                        None => break,
                    }
                }
                // Separators and whitespace terminate the current scalar.
                b',' | b' ' | b'\t' | b'\r' | b'\n' => in_scalar = false,
                // First byte of a number, `true`, `false` or `null`.
                b'0'..=b'9' | b'-' | b't' | b'f' | b'n' if depth == 0 && !in_scalar => {
                    positions.push((line, col));
                    in_scalar = true;
                }
                _ => {}
            }
        }

        // Position bookkeeping is the same in every state.
        if b == b'\n' {
            line += 1;
            col = 0;
        } else {
            col += 1;
        }
    }

    positions
}
fn extract_message_text(value: &serde_json::Value) -> String {
let obj = match value.as_object() {
Some(o) => o,
None => return normalise(&value.to_string()),
};
const CONTENT_KEYS: &[&str] = &["mes", "content", "message", "text"];
let content = CONTENT_KEYS
.iter()
.find_map(|k| obj.get(*k)?.as_str())
.map(str::to_owned);
const ROLE_KEYS: &[&str] = &["name", "role", "speaker", "author"];
let role = ROLE_KEYS
.iter()
.find_map(|k| obj.get(*k)?.as_str())
.filter(|s| !s.is_empty())
.map(str::to_owned);
match (role, content) {
(Some(r), Some(c)) => normalise(&format!("{r}: {c}")),
(None, Some(c)) => normalise(&c),
_ => {
let fallback = obj
.iter()
.filter_map(|(k, v)| v.as_str().map(|s| format!("{k}: {s}")))
.collect::<Vec<_>>()
.join("\n");
if fallback.is_empty() {
normalise(&serde_json::to_string(value).unwrap_or_default())
} else {
normalise(&fallback)
}
}
}
}
/// Splits `body` into blank-line-separated paragraphs and prefixes each with
/// the document title.
///
/// Every non-blank paragraph is trimmed and rendered as
/// `"{title}\n\n{paragraph}"`, or as the bare paragraph when `title` is
/// empty. If `body` holds no non-blank paragraph, the result is a single
/// entry containing just `title`.
pub fn chunk_document(title: &str, body: &str) -> Vec<String> {
    let mut chunks = Vec::new();

    for raw_paragraph in body.split("\n\n") {
        let paragraph = raw_paragraph.trim();
        if paragraph.is_empty() {
            continue;
        }
        let chunk = if title.is_empty() {
            paragraph.to_owned()
        } else {
            format!("{title}\n\n{paragraph}")
        };
        chunks.push(chunk);
    }

    if chunks.is_empty() {
        chunks.push(title.to_owned());
    }
    chunks
}
// Unit tests for `chunk_document`, `MarkdownChunker` and `JsonChunker`.
#[cfg(test)]
mod tests {
use super::*;
// ---- chunk_document -------------------------------------------------------
// Each paragraph is emitted with the title prepended.
#[test]
fn two_paragraphs() {
let chunks = chunk_document("Title", "First.\n\nSecond.");
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0], "Title\n\nFirst.");
assert_eq!(chunks[1], "Title\n\nSecond.");
}
// With no body at all, the title is the only chunk.
#[test]
fn empty_body_returns_title() {
let chunks = chunk_document("Title", "");
assert_eq!(chunks, vec!["Title"]);
}
// Whitespace-only paragraphs count as empty.
#[test]
fn blank_only_body_returns_title() {
let chunks = chunk_document("Title", " \n\n ");
assert_eq!(chunks, vec!["Title"]);
}
// Extra blank lines between paragraphs must not create empty chunks.
#[test]
fn filters_empty_paragraphs() {
let chunks = chunk_document("T", "Para 1.\n\n\n\nPara 2.");
assert_eq!(chunks.len(), 2);
}
// An empty title is not prepended.
#[test]
fn empty_title() {
let chunks = chunk_document("", "Only body.");
assert_eq!(chunks, vec!["Only body."]);
}
// ---- MarkdownChunker ------------------------------------------------------
// Line numbers are zero-based and account for the blank separator line.
#[test]
fn markdown_line_numbers() {
let c = MarkdownChunker;
let chunks = c.chunk("Title", "First.\n\nSecond.");
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line, 0);
assert_eq!(chunks[0].text, "First.");
assert_eq!(chunks[1].line, 2);
assert_eq!(chunks[1].text, "Second.");
}
// Leading blank lines shift the first paragraph's line number.
#[test]
fn markdown_leading_blank_lines() {
let c = MarkdownChunker;
let chunks = c.chunk("T", "\n\nActual paragraph.");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line, 2);
}
// An empty body falls back to a single title chunk at line 0.
#[test]
fn markdown_empty_body_returns_title_at_line_0() {
let c = MarkdownChunker;
let chunks = c.chunk("Title", "");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line, 0);
assert_eq!(chunks[0].text, "Title");
}
// Markdown chunks never carry a column offset.
#[test]
fn markdown_column_always_zero() {
let c = MarkdownChunker;
let chunks = c.chunk("T", "A.\n\nB.\n\nC.");
for ch in &chunks {
assert_eq!(ch.column, 0, "expected column 0 for markdown chunk");
}
}
// ---- JsonChunker: JSON Lines ----------------------------------------------
// Each JSONL record is reported at the line it occupies.
#[test]
fn jsonl_line_numbers() {
let c = JsonChunker;
let jsonl = concat!(
"{\"name\":\"User\",\"is_user\":true,\"mes\":\"Hello there\"}\n",
"{\"name\":\"Aria\",\"is_user\":false,\"mes\":\"Hi! How can I help?\"}"
);
let chunks = c.chunk("chat.jsonl", jsonl);
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].line, 0, "first message should be on line 0");
assert_eq!(chunks[1].line, 1, "second message should be on line 1");
assert!(
chunks[0].text.contains("Hello there"),
"got: {}",
chunks[0].text
);
assert!(
chunks[1].text.contains("Hi! How can I help?"),
"got: {}",
chunks[1].text
);
}
// JSONL records always start at column 0.
#[test]
fn jsonl_column_always_zero() {
let c = JsonChunker;
let jsonl =
"{\"role\":\"user\",\"content\":\"A\"}\n{\"role\":\"assistant\",\"content\":\"B\"}";
let chunks = c.chunk("chat.jsonl", jsonl);
for ch in &chunks {
assert_eq!(ch.column, 0, "JSONL chunks should always have column 0");
}
}
// ---- JsonChunker: single JSON documents -----------------------------------
// An object with a "messages" array yields one chunk per message.
#[test]
fn json_openai_messages() {
let c = JsonChunker;
let json = "{\n \"messages\": [\n {\"role\":\"user\",\"content\":\"What is 2+2?\"},\n {\"role\":\"assistant\",\"content\":\"4\"}\n ]\n}";
let chunks = c.chunk("session.json", json);
assert_eq!(chunks.len(), 2);
assert!(chunks[0].text.contains("2+2"), "got: {}", chunks[0].text);
assert!(chunks[1].text.contains('4'), "got: {}", chunks[1].text);
assert!(
chunks[0].line >= 2,
"expected line >= 2, got {}",
chunks[0].line
);
}
// A top-level array yields one chunk per element.
#[test]
fn json_array() {
let c = JsonChunker;
let json = "[\n {\"role\":\"user\",\"content\":\"Ping\"},\n {\"role\":\"assistant\",\"content\":\"Pong\"}\n]";
let chunks = c.chunk("msgs.json", json);
assert_eq!(chunks.len(), 2);
assert!(chunks[0].text.contains("Ping"), "got: {}", chunks[0].text);
assert!(chunks[1].text.contains("Pong"), "got: {}", chunks[1].text);
assert!(chunks[0].line >= 1, "first element starts on line 1");
}
// A single one-line object is one chunk (not treated as JSONL).
#[test]
fn json_single_object() {
let c = JsonChunker;
let json = "{\"role\":\"user\",\"content\":\"Just one message\"}";
let chunks = c.chunk("single.json", json);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].line, 0);
assert!(
chunks[0].text.contains("Just one message"),
"got: {}",
chunks[0].text
);
}
// Chunk text must never contain the paragraph separator "\n\n".
#[test]
fn text_has_no_double_newline() {
let c = JsonChunker;
let jsonl = concat!(
"{\"role\":\"user\",\"content\":\"Line one\\n\\nLine two\"}\n",
"{\"role\":\"assistant\",\"content\":\"OK\"}"
);
let chunks = c.chunk("chat.jsonl", jsonl);
for ch in &chunks {
assert!(
!ch.text.contains("\n\n"),
"chunk at line {} contains \\n\\n: {:?}",
ch.line,
ch.text
);
}
}
}