use std::sync::OnceLock;
use regex::Regex;
use sha2::{Digest, Sha256};
use crate::error::StorageError;
use crate::storage::watchtower as store;
use crate::storage::DbPool;
/// Errors returned by the chunking routines in this module.
#[derive(Debug, thiserror::Error)]
pub enum ChunkerError {
/// A storage-layer call failed. Built automatically from [`StorageError`]
/// through the `#[from]` conversion, so `?` can be used on storage results.
#[error("storage error: {0}")]
Storage(#[from] StorageError),
}
/// A run of document text together with the heading context it appeared under.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Fragment {
/// Labels of the enclosing headings (each including its `#` markers) joined
/// with `/`, e.g. `# Top/## Section`. Empty for text that precedes any heading.
pub heading_path: String,
/// The fragment's text with leading and trailing whitespace trimmed.
pub text: String,
/// Zero-based position of this fragment in the list returned by
/// `extract_fragments`.
pub index: usize,
}
/// Returns the shared regex that recognises ATX-style Markdown heading lines.
///
/// A match requires one to six `#` characters at the start of the line,
/// at least one whitespace character, and then non-empty text. Capture group 1
/// holds the `#` run (its length is the heading level) and group 2 the text.
///
/// The pattern is compiled on the first call and cached in a process-wide
/// `OnceLock`, so later calls only return the cached reference.
fn heading_re() -> &'static Regex {
    static HEADING: OnceLock<Regex> = OnceLock::new();

    HEADING.get_or_init(|| {
        let compiled = Regex::new(r"^(#{1,6})\s+(.+)$");
        // The pattern is a fixed literal, so a failure here is a programming error.
        compiled.expect("heading regex")
    })
}
/// Splits a Markdown document into heading-scoped text fragments.
///
/// The body is scanned line by line. Each heading line (see `heading_re`)
/// closes the fragment collected so far and updates a stack of enclosing
/// headings: entries at the same or a deeper level are popped before the new
/// heading is pushed, so a fragment's `heading_path` always lists its
/// ancestors from outermost to innermost. Heading lines themselves are not
/// part of any fragment's text, and a heading followed by no text produces no
/// fragment.
///
/// Fenced code blocks are copied verbatim into the current fragment and never
/// interpreted as headings. Fences follow the CommonMark rules:
///
/// * an opening fence is a run of three or more `` ` `` or `~` characters
///   (leading indentation is ignored); a backtick fence whose trailing info
///   string contains a backtick is treated as ordinary text, because that is
///   an inline code span rather than a fence;
/// * a block is closed only by a fence of the same character, at least as
///   long as the opener, with nothing but whitespace after it;
/// * an unclosed fence extends to the end of the document.
///
/// # Returns
///
/// The non-empty fragments in document order, with `index` numbered
/// sequentially from zero. Empty or whitespace-only input yields an empty
/// vector.
pub fn extract_fragments(body: &str) -> Vec<Fragment> {
    let mut fragments: Vec<Fragment> = Vec::new();
    let mut heading_stack: Vec<(usize, String)> = Vec::new();
    let mut current_text = String::new();
    // `Some((marker, length))` of the opening fence while inside a code block.
    let mut open_fence: Option<(char, usize)> = None;

    for line in body.lines() {
        let fence = fence_run(line);

        if let Some((marker, min_len)) = open_fence {
            // Inside a code block everything is literal text; only a matching
            // bare fence ends the block. A shorter fence or one of the other
            // character is just content (this is what allows nested fences).
            let closes = fence.is_some_and(|(ch, len, rest)| {
                ch == marker && len >= min_len && rest.trim().is_empty()
            });
            if closes {
                open_fence = None;
            }
            current_text.push_str(line);
            current_text.push('\n');
            continue;
        }

        if let Some((marker, len, info)) = fence {
            // "```code```" on a single line is an inline code span, not a fence.
            if marker != '`' || !info.contains('`') {
                open_fence = Some((marker, len));
                current_text.push_str(line);
                current_text.push('\n');
                continue;
            }
        }

        if let Some(caps) = heading_re().captures(line) {
            // A heading ends the fragment that was being collected.
            flush_fragment(&heading_stack, &mut current_text, &mut fragments);

            let level = caps[1].len();
            let heading_text = caps[2].trim().to_string();
            let heading_label = format!("{} {heading_text}", &caps[1]);

            // Drop siblings and deeper headings so the stack holds only ancestors.
            while heading_stack.last().is_some_and(|(lvl, _)| *lvl >= level) {
                heading_stack.pop();
            }
            heading_stack.push((level, heading_label));
        } else {
            current_text.push_str(line);
            current_text.push('\n');
        }
    }

    // Emit whatever text followed the last heading.
    flush_fragment(&heading_stack, &mut current_text, &mut fragments);

    // Number fragments by their final position in the output.
    for (i, frag) in fragments.iter_mut().enumerate() {
        frag.index = i;
    }
    fragments
}

/// Recognises a leading code-fence run on `line`.
///
/// Returns the fence character, the length of the run, and the remainder of
/// the line after the run when the line (ignoring leading whitespace) starts
/// with at least three identical `` ` `` or `~` characters; otherwise `None`.
fn fence_run(line: &str) -> Option<(char, usize, &str)> {
    let trimmed = line.trim_start();
    let marker = trimmed.chars().next().filter(|c| matches!(c, '`' | '~'))?;
    // Both fence characters are single-byte ASCII, so the char count is also
    // the byte offset at which the rest of the line begins.
    let len = trimmed.chars().take_while(|&c| c == marker).count();
    (len >= 3).then(|| (marker, len, &trimmed[len..]))
}
/// Turns the buffered text into a [`Fragment`] and resets the buffer.
///
/// * `heading_stack` - the headings currently in scope, outermost first; their
///   labels are joined with `/` to form the fragment's `heading_path`.
/// * `current_text` - text accumulated since the previous flush. It is always
///   cleared on return, whether or not a fragment was produced.
/// * `fragments` - output list; a fragment is appended only when the buffer
///   contains something other than whitespace.
///
/// The stored text is trimmed of leading and trailing whitespace, and the new
/// fragment's `index` is its position in `fragments`.
fn flush_fragment(
    heading_stack: &[(usize, String)],
    current_text: &mut String,
    fragments: &mut Vec<Fragment>,
) {
    let trimmed = current_text.trim();
    if !trimmed.is_empty() {
        let heading_path = heading_stack
            .iter()
            .map(|(_, label)| label.as_str())
            .collect::<Vec<_>>()
            .join("/");
        // Assign the real position at creation time instead of a placeholder,
        // so the fragment is correct even if no caller renumbers afterwards.
        let index = fragments.len();
        fragments.push(Fragment {
            heading_path,
            text: trimmed.to_string(),
            index,
        });
    }
    current_text.clear();
}
/// Chunks one content node's body and persists the resulting chunks.
///
/// The steps run in this order:
/// 1. split `body_text` with `extract_fragments`;
/// 2. call `store::mark_chunks_stale` for the node;
/// 3. build one `store::NewChunk` per fragment, whose `chunk_hash` is the
///    lowercase hex SHA-256 of the fragment text;
/// 4. call `store::upsert_chunks_for_node` with those chunks;
/// 5. run `graph_ingest::extract_and_persist_graph` on the full body (its
///    outcome is not inspected here);
/// 6. call `store::mark_node_chunked`.
///
/// # Returns
///
/// The ids returned by `store::upsert_chunks_for_node`.
///
/// # Errors
///
/// Returns [`ChunkerError::Storage`] as soon as any of the `store` calls in
/// steps 2, 4 or 6 fails; later steps are then skipped.
pub async fn chunk_node(
    pool: &DbPool,
    account_id: &str,
    node_id: i64,
    body_text: &str,
) -> Result<Vec<i64>, ChunkerError> {
    let fragments = extract_fragments(body_text);

    store::mark_chunks_stale(pool, account_id, node_id).await?;

    // The fragments are not needed afterwards, so move their strings into the
    // chunk records instead of cloning them.
    let mut new_chunks: Vec<store::NewChunk> = Vec::with_capacity(fragments.len());
    for fragment in fragments {
        let chunk_hash = format!("{:x}", Sha256::digest(fragment.text.as_bytes()));
        new_chunks.push(store::NewChunk {
            heading_path: fragment.heading_path,
            chunk_text: fragment.text,
            chunk_hash,
            chunk_index: fragment.index as i64,
        });
    }

    let ids = store::upsert_chunks_for_node(pool, account_id, node_id, &new_chunks).await?;

    super::graph_ingest::extract_and_persist_graph(pool, account_id, node_id, body_text).await;

    store::mark_node_chunked(pool, account_id, node_id).await?;

    Ok(ids)
}
pub async fn chunk_pending_nodes(pool: &DbPool, account_id: &str, limit: u32) -> u32 {
let nodes = match store::get_pending_content_nodes_for(pool, account_id, limit).await {
Ok(n) => n,
Err(e) => {
tracing::warn!(error = %e, "Failed to get pending nodes for chunking");
return 0;
}
};
let mut chunked = 0u32;
for node in &nodes {
match chunk_node(pool, account_id, node.id, &node.body_text).await {
Ok(_ids) => {
chunked += 1;
tracing::debug!(
node_id = node.id,
path = %node.relative_path,
"Chunked content node"
);
}
Err(e) => {
tracing::warn!(
node_id = node.id,
path = %node.relative_path,
error = %e,
"Failed to chunk node"
);
}
}
}
chunked
}
// Unit tests for `extract_fragments`.
//
// NOTE: the multi-line string literals below are whitespace-sensitive. Only the
// line right after the `\` continuation has its leading whitespace skipped, so
// the Markdown lines must stay at column 0 for headings to be recognised.
#[cfg(test)]
mod tests {
use super::*;
// Text without any heading becomes a single fragment with an empty path.
#[test]
fn chunk_simple_markdown() {
let body = "First paragraph.\n\nSecond paragraph.\n";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 1);
assert_eq!(frags[0].heading_path, "");
assert!(frags[0].text.contains("First paragraph"));
assert!(frags[0].text.contains("Second paragraph"));
}
// Each same-level heading starts a new fragment carrying that heading's label.
#[test]
fn chunk_splits_on_headings() {
let body = "\
## Introduction
Welcome to the guide.
## Getting Started
Install the CLI tool.
## Advanced Usage
Use the `--verbose` flag.
";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 3);
assert_eq!(frags[0].heading_path, "## Introduction");
assert!(frags[0].text.contains("Welcome"));
assert_eq!(frags[1].heading_path, "## Getting Started");
assert!(frags[1].text.contains("Install"));
assert_eq!(frags[2].heading_path, "## Advanced Usage");
assert!(frags[2].text.contains("--verbose"));
}
// An empty document produces no fragments.
#[test]
fn chunk_empty_input_returns_empty() {
assert!(extract_fragments("").is_empty());
}
// Whitespace-only text is trimmed away, so no fragment is emitted.
#[test]
fn chunk_whitespace_only_returns_empty() {
assert!(extract_fragments(" \n\n\t\n ").is_empty());
}
// Fragments are not split by length: long heading-less text stays in one piece.
#[test]
fn chunk_long_text_single_fragment() {
let long = "word ".repeat(500);
let frags = extract_fragments(&long);
assert_eq!(frags.len(), 1);
assert_eq!(frags[0].heading_path, "");
assert_eq!(frags[0].index, 0);
}
// Nested headings build a `/`-joined path; a sibling or shallower heading
// replaces the deeper entries on the stack.
#[test]
fn chunk_heading_hierarchy_preserved() {
let body = "\
# Top Level
Intro text.
## Section One
Content one.
### Subsection A
Deep content A.
### Subsection B
Deep content B.
## Section Two
Content two.
";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 5);
assert_eq!(frags[0].heading_path, "# Top Level");
assert_eq!(frags[1].heading_path, "# Top Level/## Section One");
assert_eq!(
frags[2].heading_path,
"# Top Level/## Section One/### Subsection A"
);
assert_eq!(
frags[3].heading_path,
"# Top Level/## Section One/### Subsection B"
);
assert_eq!(frags[4].heading_path, "# Top Level/## Section Two");
}
// Text before the first heading gets its own fragment with an empty path.
#[test]
fn chunk_root_text_before_heading() {
let body = "Preamble text.\n\n## Heading\n\nBody.\n";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 2);
assert_eq!(frags[0].heading_path, "");
assert!(frags[0].text.contains("Preamble"));
assert_eq!(frags[1].heading_path, "## Heading");
}
// `#` lines inside a fenced code block are content, not headings.
#[test]
fn chunk_code_block_headings_ignored() {
let body = "\
## Real Heading
Some text.
```markdown
# This is inside a code block
## Also inside
```
More text after code.
";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 1);
assert_eq!(frags[0].heading_path, "## Real Heading");
assert!(frags[0].text.contains("# This is inside a code block"));
assert!(frags[0].text.contains("More text after code"));
}
// Headings with no text under them do not produce empty fragments.
#[test]
fn chunk_consecutive_headings_skip_empty() {
let body = "## A\n## B\n## C\nFinal content.\n";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 1);
assert_eq!(frags[0].heading_path, "## C");
assert!(frags[0].text.contains("Final content"));
}
// `index` matches each fragment's position in the returned vector.
#[test]
fn chunk_indices_sequential() {
let body = "Intro.\n\n## A\n\nA text.\n\n## B\n\nB text.\n\n## C\n\nC text.\n";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 4);
for (i, frag) in frags.iter().enumerate() {
assert_eq!(frag.index, i, "fragment {i} should have index {i}");
}
}
// The deepest supported heading level (six `#`) is recognised.
#[test]
fn chunk_h6_heading() {
let body = "###### Deep Heading\n\nDeep content.\n";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 1);
assert_eq!(frags[0].heading_path, "###### Deep Heading");
}
// A shallower heading following a deeper one replaces it rather than nesting.
#[test]
fn chunk_heading_level_reset() {
let body = "\
### Level 3
Content at 3.
## Level 2
Content at 2.
";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 2);
assert_eq!(frags[0].heading_path, "### Level 3");
assert_eq!(frags[1].heading_path, "## Level 2");
}
// Surrounding blank lines and spaces are stripped from the fragment text.
#[test]
fn chunk_fragment_text_trimmed() {
let body = "## Heading\n\n\n Some text. \n\n\n";
let frags = extract_fragments(body);
assert_eq!(frags.len(), 1);
assert_eq!(frags[0].text, "Some text.");
}
}