/// Length threshold used by `merge_short`: while an accumulated chunk is shorter than
/// this, the next chunk is appended to it. Compared against `String::len`, so the
/// unit is bytes, not characters.
const MIN_CHUNK_LEN: usize = 80;
/// Splits `text` into chunks, choosing a strategy from the content, then passes the
/// result through `merge_short`.
///
/// Strategy selection, first match wins:
/// 1. Markdown (`chunk_markdown`) when at least one line starts with `# `, `## ` or `### `.
/// 2. JSON (`chunk_json`) when the whole text parses as a JSON value.
/// 3. Otherwise `chunk_paragraphs`.
pub fn chunk(text: &str) -> Vec<String> {
    // Heading markers only count at the start of a line. A plain substring search for
    // "# " also fires on prose such as "C# is ..." or on JSON string values, which
    // would send the text to the markdown splitter even though it has no heading to
    // split on, returning the whole input as a single chunk.
    let has_heading = text.lines().any(|line| {
        ["# ", "## ", "### "]
            .iter()
            .any(|prefix| line.starts_with(prefix))
    });

    let chunks = if has_heading {
        chunk_markdown(text)
    } else if let Ok(value) = serde_json::from_str::<serde_json::Value>(text) {
        chunk_json(&value)
    } else {
        chunk_paragraphs(text)
    };
    merge_short(chunks)
}
/// Splits markdown into sections, starting a new section at every line that begins
/// with `# `, `## ` or `### `.
///
/// Each section keeps its heading line and is trimmed of surrounding whitespace.
/// Text before the first heading becomes its own section; sections that are blank
/// after trimming are not emitted.
fn chunk_markdown(text: &str) -> Vec<String> {
    const HEADING_PREFIXES: [&str; 3] = ["# ", "## ", "### "];

    let mut sections: Vec<String> = Vec::new();
    let mut buffer = String::new();

    for line in text.lines() {
        let opens_section = HEADING_PREFIXES.iter().any(|prefix| line.starts_with(prefix));
        if opens_section {
            // Close the section collected so far, unless it holds only whitespace.
            let finished = buffer.trim();
            if !finished.is_empty() {
                sections.push(finished.to_string());
                buffer.clear();
            }
        }
        buffer.push_str(line);
        buffer.push('\n');
    }

    let tail = buffer.trim();
    if !tail.is_empty() {
        sections.push(tail.to_string());
    }
    sections
}
/// Turns a parsed JSON value into chunks.
///
/// * Array: one pretty-printed chunk per element; elements whose rendering is empty
///   (pretty-printing failed) are dropped.
/// * Object: one `key: <pretty-printed value>` chunk per entry.
/// * Any other value: a single chunk holding the value's `to_string()` form.
fn chunk_json(value: &serde_json::Value) -> Vec<String> {
    // A failed pretty-print degrades to an empty string rather than an error.
    let render = |v: &serde_json::Value| serde_json::to_string_pretty(v).unwrap_or_default();

    match value {
        serde_json::Value::Array(items) => {
            let mut pieces = Vec::with_capacity(items.len());
            for item in items.iter() {
                let rendered = render(item);
                if !rendered.is_empty() {
                    pieces.push(rendered);
                }
            }
            pieces
        }
        serde_json::Value::Object(fields) => {
            let mut pieces = Vec::with_capacity(fields.len());
            for (key, field) in fields.iter() {
                pieces.push(format!("{}: {}", key, render(field)));
            }
            pieces
        }
        other => vec![other.to_string()],
    }
}
/// Splits plain text into paragraphs separated by blank lines.
///
/// A blank line is any line containing only whitespace, so `\n\n`, `\r\n\r\n` and
/// lines holding stray spaces or tabs all act as separators. Each paragraph is a
/// trimmed slice of the original text (interior line endings are preserved);
/// paragraphs that are empty after trimming are skipped.
fn chunk_paragraphs(text: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();
    // Byte offset where the paragraph currently being collected begins.
    let mut start = 0;
    // Byte offset of the line currently being inspected.
    let mut pos = 0;

    // `split_inclusive` keeps each line's terminator, so summing line lengths tracks
    // exact byte offsets into `text`, and every offset falls on a char boundary.
    for line in text.split_inclusive('\n') {
        if line.trim().is_empty() {
            let paragraph = text[start..pos].trim();
            if !paragraph.is_empty() {
                paragraphs.push(paragraph.to_string());
            }
            start = pos + line.len();
        }
        pos += line.len();
    }

    let tail = text[start..].trim();
    if !tail.is_empty() {
        paragraphs.push(tail.to_string());
    }
    paragraphs
}
pub fn merge_short(chunks: Vec<String>) -> Vec<String> {
let mut result: Vec<String> = Vec::new();
let mut carry = String::new();
for chunk in chunks {
if carry.is_empty() {
carry = chunk;
} else if carry.len() < MIN_CHUNK_LEN {
carry.push_str("\n\n");
carry.push_str(&chunk);
} else {
result.push(carry);
carry = chunk;
}
}
if !carry.is_empty() {
result.push(carry);
}
result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pads `s` with `MIN_CHUNK_LEN` filler characters so the result is always at
    /// least as long as the merge threshold and stays a separate chunk.
    fn long(s: &str) -> String {
        format!("{s} {}", "x".repeat(MIN_CHUNK_LEN))
    }

    #[test]
    fn markdown_splits_on_headers() {
        let text = format!(
            "## Section A\n{}\n\n## Section B\n{}",
            long("Content about alpha topics."),
            long("Content about beta topics.")
        );
        let chunks = chunk(&text);
        assert!(
            chunks.len() >= 2,
            "expected ≥2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.contains("Section A")));
        assert!(chunks.iter().any(|c| c.contains("Section B")));
    }

    #[test]
    fn json_array_splits_per_element() {
        let items: Vec<String> = (0..3)
            .map(|i| format!(r#"{{"key_{i}": "{}"}}"#, "v".repeat(MIN_CHUNK_LEN)))
            .collect();
        let text = format!("[{}]", items.join(","));
        let chunks = chunk(&text);
        // Every element is longer than the merge threshold, so each must come back
        // as its own chunk; a bare non-empty check would also pass if they were all
        // collapsed into one.
        assert_eq!(chunks.len(), 3, "expected one chunk per array element");
        for (i, piece) in chunks.iter().enumerate() {
            let key = format!("key_{i}");
            assert!(
                piece.contains(key.as_str()),
                "chunk {i} should contain {key}"
            );
        }
    }

    #[test]
    fn plain_text_splits_on_blank_lines() {
        let text = format!(
            "{}\n\n{}",
            long("First paragraph with substantial content."),
            long("Second paragraph with substantial content.")
        );
        let chunks = chunk(&text);
        assert_eq!(chunks.len(), 2);
    }

    #[test]
    fn short_chunks_are_merged() {
        let short = "Hi.\n\nBye.\n\nOk.";
        let chunks = chunk(short);
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn empty_input_returns_empty() {
        assert!(chunk("").is_empty());
        assert!(chunk(" \n\n ").is_empty());
    }
}