use crate::types::structure::{
ChunkType, ChunkingOptions, ChunkingResult, CodeChunkingStrategy, ElementData, StructuredChunk,
StructuredDocument, StructuredTable, TableChunkingStrategy,
};
/// Chunker that turns a `StructuredDocument` into a `ChunkingResult`, handling
/// tables and code blocks separately from running prose.
pub struct StructuralChunker {
// Settings read by the chunking methods: `max_chars`, `overlap_chars`,
// `include_section_headers`, `preserve_lists`, `table_handling` and `code_handling`.
options: ChunkingOptions,
}
/// A default chunker is one built from the default [`ChunkingOptions`].
impl Default for StructuralChunker {
    fn default() -> Self {
        let options = ChunkingOptions::default();
        Self::new(options)
    }
}
impl StructuralChunker {
#[must_use]
pub fn new(options: ChunkingOptions) -> Self {
Self { options }
}
#[must_use]
pub fn with_max_chars(max_chars: usize) -> Self {
Self {
options: ChunkingOptions {
max_chars,
..Default::default()
},
}
}
/// Splits `doc` into chunks, visiting its elements in document order.
///
/// * Tables and code blocks first flush any accumulated prose as a text chunk and are
///   then chunked on their own by `chunk_table` / `chunk_code_block`.
/// * Headings, lists, paragraphs, block quotes and raw text are appended to a running
///   text buffer.
/// * A paragraph (or a list, when `preserve_lists` is set) that would push the buffer
///   past `max_chars` flushes the buffer first. After a paragraph-triggered flush the
///   most recent heading is repeated at the top of the new buffer when
///   `include_section_headers` is enabled (once per heading).
/// * A separator flushes the buffer and forgets the remembered heading.
///
/// Whatever remains in the buffer at the end is emitted with `doc.total_chars` as its
/// end offset. The buffer itself is never split, so a single element longer than
/// `max_chars` still produces one oversized text chunk.
#[must_use]
pub fn chunk(&self, doc: &StructuredDocument) -> ChunkingResult {
    let mut result = ChunkingResult::empty();
    let mut current_text = String::new();
    let mut current_start = 0;
    // Owned copy of the latest heading; an owned `String` avoids leaking memory just to
    // obtain a long-lived `&str`.
    let mut pending_heading: Option<String> = None;

    for element in &doc.elements {
        match &element.data {
            ElementData::Table(table) => {
                self.flush_text(
                    &mut result,
                    &mut current_text,
                    current_start,
                    element.char_start,
                );
                self.chunk_table(&mut result, table, element.char_start, element.char_end);
                current_start = element.char_end;
            }
            ElementData::CodeBlock(block) => {
                self.flush_text(
                    &mut result,
                    &mut current_text,
                    current_start,
                    element.char_start,
                );
                self.chunk_code_block(
                    &mut result,
                    &block.format(),
                    block.language.as_deref(),
                    element.char_start,
                    element.char_end,
                );
                current_start = element.char_end;
            }
            ElementData::Heading(heading) => {
                // Format once and reuse the text for both the buffer and the
                // remembered heading.
                let heading_text = heading.format();
                if !current_text.is_empty() {
                    current_text.push('\n');
                }
                current_text.push_str(&heading_text);
                if self.options.include_section_headers {
                    pending_heading = Some(heading_text);
                }
            }
            ElementData::List(list) => {
                if self.options.preserve_lists {
                    let list_text = list.format();
                    let combined_len = current_text.chars().count() + list_text.chars().count();
                    // Keep the list in one piece: start a new chunk rather than
                    // letting the list straddle the size limit.
                    if combined_len > self.options.max_chars && !current_text.trim().is_empty()
                    {
                        self.flush_text(
                            &mut result,
                            &mut current_text,
                            current_start,
                            element.char_start,
                        );
                        current_start = element.char_start;
                    }
                    if !current_text.is_empty() {
                        current_text.push_str("\n\n");
                    }
                    current_text.push_str(&list_text);
                } else {
                    let text = element.text();
                    if !current_text.is_empty() {
                        current_text.push_str("\n\n");
                    }
                    current_text.push_str(&text);
                }
            }
            ElementData::Paragraph { text } => {
                let text_len = text.chars().count();
                let current_len = current_text.chars().count();
                if current_len + text_len > self.options.max_chars
                    && !current_text.trim().is_empty()
                {
                    self.flush_text(
                        &mut result,
                        &mut current_text,
                        current_start,
                        element.char_start,
                    );
                    current_start = element.char_start;
                    // Repeat the section heading at the top of the continuation chunk;
                    // `take` ensures each heading is repeated at most once.
                    if let Some(heading) = pending_heading.take() {
                        current_text.push_str(&heading);
                        current_text.push_str("\n\n");
                    }
                }
                if !current_text.is_empty() && !current_text.ends_with('\n') {
                    current_text.push_str("\n\n");
                }
                current_text.push_str(text);
            }
            ElementData::BlockQuote { text } => {
                if !current_text.is_empty() {
                    current_text.push_str("\n\n");
                }
                current_text.push_str("> ");
                current_text.push_str(text);
            }
            ElementData::Separator => {
                self.flush_text(
                    &mut result,
                    &mut current_text,
                    current_start,
                    element.char_start,
                );
                current_start = element.char_end;
                pending_heading = None;
            }
            ElementData::Raw { text } => {
                if !current_text.is_empty() {
                    current_text.push_str("\n\n");
                }
                current_text.push_str(text);
            }
        }
    }

    self.flush_text(
        &mut result,
        &mut current_text,
        current_start,
        doc.total_chars,
    );
    result
}

/// Emits `buffer` as a text chunk covering `char_start..char_end` and clears it, unless
/// the buffer holds nothing but whitespace (in which case it is left untouched).
fn flush_text(
    &self,
    result: &mut ChunkingResult,
    buffer: &mut String,
    char_start: usize,
    char_end: usize,
) {
    if !buffer.trim().is_empty() {
        self.emit_text_chunk(result, buffer.as_str(), char_start, char_end);
        buffer.clear();
    }
}
/// Appends a text chunk holding the trimmed `text` to `result`.
///
/// The chunk's index is its position in `result.chunks`, and `char_start..char_end` is
/// recorded as its span.
fn emit_text_chunk(
    &self,
    result: &mut ChunkingResult,
    text: &str,
    char_start: usize,
    char_end: usize,
) {
    let next_index = result.chunks.len();
    let trimmed = text.trim();
    let chunk = StructuredChunk::text(trimmed, next_index, char_start, char_end);
    result.chunks.push(chunk);
}
/// Appends the chunk(s) for one table to `result` according to `table_handling`.
///
/// * `PreserveWhole` emits the raw table text as a single table chunk.
/// * `SplitWithHeader` emits the raw text unchanged when `table.char_count()` is within
///   `max_chars`. Otherwise the data rows are grouped, each group is prefixed with the
///   formatted header, the first group is emitted as a table chunk and the rest as
///   table continuations carrying their part number, the total part count and the
///   header. A table with no data rows yields a single header-only chunk.
/// * `Naive` emits the raw table text as an ordinary text chunk.
///
/// Every chunk produced here records `char_start..char_end` (the whole table) as its
/// span. `tables_processed` is always incremented; `tables_split` only on the split
/// path.
fn chunk_table(
    &self,
    result: &mut ChunkingResult,
    table: &StructuredTable,
    char_start: usize,
    char_end: usize,
) {
    result.tables_processed += 1;
    match self.options.table_handling {
        TableChunkingStrategy::PreserveWhole => {
            let index = result.chunks.len();
            result.chunks.push(StructuredChunk::table(
                &table.raw_text,
                index,
                &table.id,
                char_start,
                char_end,
            ));
        }
        TableChunkingStrategy::SplitWithHeader => {
            // Fast path: a table that fits needs no header formatting at all.
            if table.char_count() <= self.options.max_chars {
                let index = result.chunks.len();
                result.chunks.push(StructuredChunk::table(
                    &table.raw_text,
                    index,
                    &table.id,
                    char_start,
                    char_end,
                ));
                return;
            }

            result.tables_split += 1;
            let header_text = table.format_header();
            let data_rows: Vec<_> = table.data_rows().collect();
            if data_rows.is_empty() {
                let index = result.chunks.len();
                result.chunks.push(StructuredChunk::table(
                    &header_text,
                    index,
                    &table.id,
                    char_start,
                    char_end,
                ));
                return;
            }

            let header_chars = header_text.chars().count();
            // `slice::chunks` panics on a zero size, so clamp defensively.
            let rows_per_chunk = self.calculate_rows_per_chunk(table, header_chars).max(1);
            let total_parts =
                u32::try_from(data_rows.len().div_ceil(rows_per_chunk)).unwrap_or(0);

            for (part_idx, rows_in_chunk) in data_rows.chunks(rows_per_chunk).enumerate() {
                let mut chunk_text = header_text.clone();
                for row in rows_in_chunk {
                    chunk_text.push('\n');
                    chunk_text.push_str(&table.format_row(row));
                }
                let index = result.chunks.len();
                if part_idx == 0 {
                    result.chunks.push(StructuredChunk::table(
                        &chunk_text,
                        index,
                        &table.id,
                        char_start,
                        char_end,
                    ));
                } else {
                    result.chunks.push(StructuredChunk::table_continuation(
                        &chunk_text,
                        index,
                        &table.id,
                        // Parts are 1-based; checked conversion instead of an `as` cast.
                        u32::try_from(part_idx + 1).unwrap_or(0),
                        total_parts,
                        &header_text,
                        char_start,
                        char_end,
                    ));
                }
            }
        }
        TableChunkingStrategy::Naive => {
            let index = result.chunks.len();
            result.chunks.push(StructuredChunk::text(
                &table.raw_text,
                index,
                char_start,
                char_end,
            ));
        }
    }
}
/// Estimates how many data rows fit in one table chunk next to a header of
/// `header_chars` characters. The result is always at least 1.
///
/// The budget is `max_chars` minus the header and a 10-character margin. Each row is
/// estimated as the sum of its cells' character counts plus 3 characters per cell, and
/// the budget is divided by the average estimate. If the average is zero, all rows are
/// placed in a single chunk.
fn calculate_rows_per_chunk(&self, table: &StructuredTable, header_chars: usize) -> usize {
    let budget = self.options.max_chars.saturating_sub(header_chars + 10);
    if budget == 0 {
        return 1;
    }

    let row_count = table.data_row_count();
    if row_count == 0 {
        return 1;
    }

    let mut estimated_total: usize = 0;
    for row in table.data_rows() {
        let cell_chars: usize = row.cells.iter().map(|cell| cell.text.chars().count()).sum();
        estimated_total += cell_chars + row.cells.len() * 3;
    }

    match estimated_total / row_count {
        0 => row_count,
        avg_row_chars => (budget / avg_row_chars).max(1),
    }
}
/// Appends the chunk(s) for one formatted code block to `result` according to
/// `code_handling`, and increments `code_blocks_processed`.
///
/// * `PreserveWhole` always emits the block as one `CodeBlock` chunk.
/// * `SplitAtBoundaries` emits it whole when it is within `max_chars` and otherwise
///   delegates to `split_code_at_boundaries`.
/// * `SplitWithOverlap` always delegates to `split_code_with_overlap`.
///
/// Whole-block chunks carry no part information and use `language` as their context.
fn chunk_code_block(
    &self,
    result: &mut ChunkingResult,
    formatted_text: &str,
    language: Option<&str>,
    char_start: usize,
    char_end: usize,
) {
    result.code_blocks_processed += 1;

    // Shared by every path that keeps the block in one piece.
    let push_whole = |out: &mut ChunkingResult| {
        let index = out.chunks.len();
        out.chunks.push(StructuredChunk {
            text: formatted_text.to_string(),
            chunk_type: ChunkType::CodeBlock,
            index,
            element_id: None,
            part: None,
            total_parts: None,
            context: language.map(String::from),
            char_start,
            char_end,
        });
    };

    match self.options.code_handling {
        CodeChunkingStrategy::PreserveWhole => push_whole(result),
        CodeChunkingStrategy::SplitAtBoundaries
            if formatted_text.chars().count() <= self.options.max_chars =>
        {
            push_whole(result);
        }
        CodeChunkingStrategy::SplitAtBoundaries => {
            self.split_code_at_boundaries(result, formatted_text, language, char_start, char_end);
        }
        CodeChunkingStrategy::SplitWithOverlap => {
            self.split_code_with_overlap(result, formatted_text, language, char_start, char_end);
        }
    }
}
/// Splits a fenced code block into several chunks, cutting only at "boundary" lines.
///
/// The first and last lines of `formatted_text` are treated as the opening and closing
/// fences and are re-applied around every piece. A content line is a boundary when it
/// is blank or (after trimming) starts with `fn `, `def `, `function `, `class ` or
/// `impl `. A cut is made *before* a boundary line once the current piece has grown
/// beyond half of `max_chars`. Pieces are not otherwise size-limited, so code without
/// boundaries stays in one piece.
///
/// The first piece is emitted as `CodeBlock`, later ones as `CodeBlockContinuation`;
/// all carry 1-based `part` / `total_parts`, `language` as context and the span
/// `char_start..char_end` of the whole block. If there are no content lines (fewer
/// than three lines of text), the non-blank text is emitted unchanged as a single
/// part instead of being dropped.
fn split_code_at_boundaries(
    &self,
    result: &mut ChunkingResult,
    formatted_text: &str,
    language: Option<&str>,
    char_start: usize,
    char_end: usize,
) {
    const BOUNDARY_PREFIXES: [&str; 5] = ["fn ", "def ", "function ", "class ", "impl "];

    let lines: Vec<&str> = formatted_text.lines().collect();
    let fence_start = lines.first().copied().unwrap_or("```");
    let fence_end = lines.last().copied().unwrap_or("```");
    // `get` yields `None` for the inverted range produced by 0 or 1 lines, where plain
    // indexing would panic.
    let content_lines: &[&str] = lines
        .get(1..lines.len().saturating_sub(1))
        .unwrap_or_default();

    // NOTE(review): the fence line presumably already ends with the language tag;
    // append it only when it is missing so the tag is never duplicated.
    let opening_fence = match language {
        Some(lang) if !fence_start.ends_with(lang) => format!("{fence_start}{lang}"),
        _ => fence_start.to_string(),
    };

    let half_budget = self.options.max_chars / 2;
    let mut pieces: Vec<String> = Vec::new();
    let mut current: Vec<&str> = Vec::new();
    let mut current_chars = 0usize;

    for (i, line) in content_lines.iter().copied().enumerate() {
        let trimmed = line.trim();
        let is_boundary = trimmed.is_empty()
            || BOUNDARY_PREFIXES
                .iter()
                .any(|prefix| trimmed.starts_with(*prefix));

        if is_boundary && i > 0 && current_chars > half_budget && !current.is_empty() {
            pieces.push(current.join("\n"));
            current.clear();
            current_chars = 0;
        }
        current.push(line);
        // +1 accounts for the newline that joins this line to the next.
        current_chars += line.chars().count() + 1;
    }
    if !current.is_empty() {
        pieces.push(current.join("\n"));
    }

    if pieces.is_empty() {
        // Nothing between the fences: keep the block as-is rather than losing it.
        if !formatted_text.trim().is_empty() {
            let index = result.chunks.len();
            result.chunks.push(StructuredChunk {
                text: formatted_text.to_string(),
                chunk_type: ChunkType::CodeBlock,
                index,
                element_id: None,
                part: Some(1),
                total_parts: Some(1),
                context: language.map(String::from),
                char_start,
                char_end,
            });
        }
        return;
    }

    let total_parts = u32::try_from(pieces.len()).unwrap_or(0);
    for (i, piece) in pieces.into_iter().enumerate() {
        let index = result.chunks.len();
        let chunk_type = if i == 0 {
            ChunkType::CodeBlock
        } else {
            ChunkType::CodeBlockContinuation
        };
        result.chunks.push(StructuredChunk {
            text: format!("{opening_fence}\n{piece}\n{fence_end}"),
            chunk_type,
            index,
            element_id: None,
            part: Some(u32::try_from(i + 1).unwrap_or(0)),
            total_parts: Some(total_parts),
            context: language.map(String::from),
            char_start,
            char_end,
        });
    }
}
/// Splits a fenced code block into line-based windows that overlap each other.
///
/// The first and last lines of `formatted_text` are treated as the opening and closing
/// fences and are re-applied around every piece. Each window takes as many consecutive
/// content lines as fit in `max_chars` (always at least one line). The next window
/// starts `overlap_lines` lines before the end of the previous one, where
/// `overlap_lines` is `overlap_chars / 40` with a minimum of 2, but always at least one
/// line after the previous start so the loop is guaranteed to terminate.
///
/// The first piece is emitted as `CodeBlock`, later ones as `CodeBlockContinuation`;
/// all carry 1-based `part` / `total_parts`, `language` as context and the span
/// `char_start..char_end` of the whole block. If there are no content lines (fewer
/// than three lines of text), the non-blank text is emitted unchanged as a single
/// part instead of being dropped.
fn split_code_with_overlap(
    &self,
    result: &mut ChunkingResult,
    formatted_text: &str,
    language: Option<&str>,
    char_start: usize,
    char_end: usize,
) {
    let lines: Vec<&str> = formatted_text.lines().collect();
    let fence_start = lines.first().copied().unwrap_or("```");
    let fence_end = lines.last().copied().unwrap_or("```");
    // `get` yields `None` for the inverted range produced by 0 or 1 lines, where plain
    // indexing would panic.
    let content_lines: &[&str] = lines
        .get(1..lines.len().saturating_sub(1))
        .unwrap_or_default();

    // NOTE(review): the fence line presumably already ends with the language tag;
    // append it only when it is missing so the tag is never duplicated.
    let opening_fence = match language {
        Some(lang) if !fence_start.ends_with(lang) => format!("{fence_start}{lang}"),
        _ => fence_start.to_string(),
    };

    // NOTE(review): 40 looks like an assumed average line width used to convert the
    // character overlap into a line count — confirm.
    let overlap_lines = (self.options.overlap_chars / 40).max(2);

    let mut pieces: Vec<String> = Vec::new();
    let mut start_line = 0;
    while start_line < content_lines.len() {
        let mut current_chars = 0;
        let mut end_line = start_line;
        while end_line < content_lines.len() {
            current_chars += content_lines[end_line].chars().count() + 1;
            if current_chars > self.options.max_chars {
                break;
            }
            end_line += 1;
        }
        // A single line longer than the budget still gets a window of its own.
        if end_line == start_line {
            end_line = start_line + 1;
        }
        pieces.push(content_lines[start_line..end_line].join("\n"));

        start_line = if end_line >= content_lines.len() {
            content_lines.len()
        } else {
            // Step back for the overlap, but never to (or before) the current start:
            // without the clamp, windows of `overlap_lines` lines or fewer would be
            // revisited forever.
            end_line
                .saturating_sub(overlap_lines)
                .max(start_line + 1)
        };
    }

    if pieces.is_empty() {
        // Nothing between the fences: keep the block as-is rather than losing it.
        if !formatted_text.trim().is_empty() {
            let index = result.chunks.len();
            result.chunks.push(StructuredChunk {
                text: formatted_text.to_string(),
                chunk_type: ChunkType::CodeBlock,
                index,
                element_id: None,
                part: Some(1),
                total_parts: Some(1),
                context: language.map(String::from),
                char_start,
                char_end,
            });
        }
        return;
    }

    let total_parts = u32::try_from(pieces.len()).unwrap_or(0);
    for (i, piece) in pieces.into_iter().enumerate() {
        let index = result.chunks.len();
        let chunk_type = if i == 0 {
            ChunkType::CodeBlock
        } else {
            ChunkType::CodeBlockContinuation
        };
        result.chunks.push(StructuredChunk {
            text: format!("{opening_fence}\n{piece}\n{fence_end}"),
            chunk_type,
            index,
            element_id: None,
            part: Some(u32::try_from(i + 1).unwrap_or(0)),
            total_parts: Some(total_parts),
            context: language.map(String::from),
            char_start,
            char_end,
        });
    }
}
}
/// Chunks `doc` with a [`StructuralChunker`] built from the default options.
#[must_use]
pub fn chunk_structured(doc: &StructuredDocument) -> ChunkingResult {
    let chunker = StructuralChunker::default();
    chunker.chunk(doc)
}
/// Chunks `doc` with default options except for the given `max_chars` limit.
#[must_use]
pub fn chunk_structured_with_max(doc: &StructuredDocument, max_chars: usize) -> ChunkingResult {
    let chunker = StructuralChunker::with_max_chars(max_chars);
    chunker.chunk(doc)
}
// Unit tests for the structural chunker. Each test parses a small fixture with
// `detect_structure` and checks the chunks and counters produced from it.
#[cfg(test)]
mod tests {
use super::*;
use crate::structure::detect_structure;
// Plain prose must yield at least one chunk and must not count any tables.
#[test]
fn test_simple_text_chunking() {
let text = "This is a simple paragraph.\n\nAnother paragraph here.";
let doc = detect_structure(text);
let result = chunk_structured(&doc);
assert!(!result.chunks.is_empty());
assert_eq!(result.tables_processed, 0);
}
// A small table between two prose lines stays in a single table chunk and is
// counted as processed but not split.
#[test]
fn test_table_preserved_when_small() {
let text = r"Introduction.
| Name | Age |
|------|-----|
| Alice | 30 |
| Bob | 25 |
Conclusion.";
let doc = detect_structure(text);
let result = chunk_structured(&doc);
let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
assert_eq!(table_chunks.len(), 1);
assert_eq!(result.tables_processed, 1);
assert_eq!(result.tables_split, 0);
}
// A 50-row table chunked with max_chars = 500 must be split into several table
// chunks, each containing the header row; every chunk after the first must be a
// `TableContinuation` with a context set.
#[test]
fn test_large_table_split_with_header() {
let mut rows = String::new();
for i in 1..=50 {
rows.push_str(&format!(
"| Row {} with some data | More data here | Even more |\n",
i
));
}
let text = format!(
r"Introduction.
| Column A | Column B | Column C |
|----------|----------|----------|
{}
Conclusion.",
rows
);
let doc = detect_structure(&text);
let chunker = StructuralChunker::with_max_chars(500);
let result = chunker.chunk(&doc);
let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
assert!(table_chunks.len() > 1, "Large table should be split");
assert_eq!(result.tables_split, 1);
for chunk in &table_chunks {
assert!(
chunk.text.contains("| Column A |"),
"Each table chunk should contain header"
);
}
for chunk in table_chunks.iter().skip(1) {
assert_eq!(chunk.chunk_type, ChunkType::TableContinuation);
assert!(chunk.context.is_some());
}
}
// A short fenced code block must come out as exactly one `CodeBlock` chunk that
// still contains its source text.
#[test]
fn test_code_block_preserved() {
let text = r#"Here is code:
```rust
fn main() {
println!("Hello!");
}
```
Done."#;
let doc = detect_structure(text);
let result = chunk_structured(&doc);
let code_chunks: Vec<_> = result
.chunks
.iter()
.filter(|c| matches!(c.chunk_type, ChunkType::CodeBlock))
.collect();
assert_eq!(code_chunks.len(), 1);
assert!(code_chunks[0].text.contains("fn main()"));
}
// A document mixing headings, prose, a table and a code block must count at least
// one table and one code block and produce at least three chunks.
#[test]
fn test_mixed_content() {
let text = r#"# Report
## Summary
This is the summary section.
| Item | Count |
|------|-------|
| A | 10 |
| B | 20 |
## Code
```python
def hello():
print("Hello")
```
## Conclusion
All done."#;
let doc = detect_structure(text);
let result = chunk_structured(&doc);
assert!(result.tables_processed >= 1);
assert!(result.code_blocks_processed >= 1);
assert!(result.chunks.len() >= 3);
}
// `format_header` must contain the column-name row and a separator row written as
// `|---|---|---|`, even though the fixture's separator uses longer dashes.
#[test]
fn test_table_header_formatting() {
let text = r"| Col1 | Col2 | Col3 |
|------|------|------|
| A1 | A2 | A3 |
| B1 | B2 | B3 |";
let doc = detect_structure(text);
let table = doc.tables().next().unwrap();
let header = table.format_header();
assert!(header.contains("| Col1 | Col2 | Col3 |"));
assert!(header.contains("|---|---|---|"));
}
// With `PreserveWhole`, a 20-row table stays in one chunk and is not counted as
// split, even with max_chars lowered to 500.
#[test]
fn test_preserve_whole_strategy() {
let mut rows = String::new();
for i in 1..=20 {
rows.push_str(&format!("| Data {} | Value |\n", i));
}
let text = format!(
r"| Header1 | Header2 |
|---------|---------|
{}",
rows
);
let doc = detect_structure(&text);
let chunker = StructuralChunker::new(ChunkingOptions {
max_chars: 500,
table_handling: TableChunkingStrategy::PreserveWhole,
..Default::default()
});
let result = chunker.chunk(&doc);
let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
assert_eq!(table_chunks.len(), 1);
assert_eq!(result.tables_split, 0);
}
// The counters must reflect every structural element: two tables and one code
// block in this fixture.
#[test]
fn test_chunking_result_stats() {
let text = r"| A | B |
|---|---|
| 1 | 2 |
```python
x = 1
```
| C | D |
|---|---|
| 3 | 4 |";
let doc = detect_structure(text);
let result = chunk_structured(&doc);
assert_eq!(result.tables_processed, 2);
assert_eq!(result.code_blocks_processed, 1);
}
}