/// Splits `text` into a sequence of [`DocumentChunk`]s whose `text_content`
/// is at most `MAX_CHUNK_SIZE` bytes long.
///
/// Lines (as produced by [`str::lines`]) are packed greedily: consecutive
/// lines are joined with `'\n'` until adding the next line would push the
/// chunk over `MAX_CHUNK_SIZE` bytes, at which point the chunk is emitted and
/// a new one is started. A single line that is itself longer than
/// `MAX_CHUNK_SIZE` is broken into several chunks (see
/// `split_at_chunk_limit`) instead of being emitted as one oversized chunk.
///
/// Chunks consisting solely of whitespace are never emitted. `chunk_index`
/// starts at 0 and increases by one per emitted chunk. Every chunk carries
/// `relative_path`, `doc_type`, `checksum` and `quality` unchanged, and has
/// `page_number` and `section_heading` set to `None`.
///
/// Returns an empty vector when `text` contains no non-whitespace content.
fn split_into_chunks(
    text: &str,
    relative_path: &str,
    doc_type: DocumentType,
    checksum: &str,
    quality: f32,
) -> Vec<DocumentChunk> {
    let mut chunks: Vec<DocumentChunk> = Vec::new();
    let mut chunk_idx: u32 = 0;

    // Single place where chunks are built, so the mid-stream and final
    // flushes apply the same "skip whitespace-only content" rule.
    let mut emit = |content: String| {
        if content.trim().is_empty() {
            return;
        }
        chunks.push(DocumentChunk {
            file_path: relative_path.to_string(),
            doc_type: doc_type.clone(),
            chunk_index: chunk_idx,
            page_number: None,
            section_heading: None,
            text_content: content,
            file_checksum: checksum.to_string(),
            extraction_quality: quality,
        });
        chunk_idx += 1;
    };

    let mut current = String::new();
    for line in text.lines() {
        let mut rest = line;

        // A line that cannot fit in any chunk: flush what we have so order is
        // preserved, then peel off limit-sized pieces until the tail fits.
        if rest.len() > MAX_CHUNK_SIZE {
            emit(std::mem::take(&mut current));
            while rest.len() > MAX_CHUNK_SIZE {
                let (head, tail) = split_at_chunk_limit(rest);
                emit(head.to_string());
                rest = tail;
            }
        }

        // The `+ 1` accounts for the '\n' separator added below.
        if !current.is_empty() && current.len() + rest.len() + 1 > MAX_CHUNK_SIZE {
            emit(std::mem::take(&mut current));
        }
        if !current.is_empty() {
            current.push('\n');
        }
        current.push_str(rest);
    }
    emit(current);

    chunks
}

/// Splits `text` into a non-empty head of at most `MAX_CHUNK_SIZE` bytes and
/// the remaining tail.
///
/// The cut never falls inside a UTF-8 character. When the head window
/// contains a space that is not its first byte, the cut is made at the last
/// such space and that single space is dropped (it acts as the separator
/// between the two pieces). Otherwise the text is cut hard at the limit.
///
/// If `MAX_CHUNK_SIZE` is smaller than the first character, that one
/// character is returned as the head so callers always make progress.
fn split_at_chunk_limit(text: &str) -> (&str, &str) {
    let mut cut = MAX_CHUNK_SIZE.min(text.len());
    while cut > 0 && !text.is_char_boundary(cut) {
        cut -= 1;
    }
    if cut == 0 {
        cut = text.chars().next().map_or(0, char::len_utf8);
    }
    match text[..cut].rfind(' ') {
        // `' '` is one byte, so `pos + 1` is always a char boundary.
        Some(pos) if pos > 0 => (&text[..pos], &text[pos + 1..]),
        _ => text.split_at(cut),
    }
}
/// Returns `text` shortened to at most `MAX_CHUNK_SIZE` bytes.
///
/// Text that already fits is returned unchanged. Otherwise the text is cut
/// at the largest UTF-8 character boundary not exceeding `MAX_CHUNK_SIZE`,
/// and then, where possible, backed up to the last space so that a word is
/// not cut in half:
///
/// * if the character immediately after the cut is a space, the cut already
///   sits on a word boundary and is kept as is;
/// * otherwise the text is cut at the last space inside the window, provided
///   that space is not the very first byte (cutting there would discard
///   everything);
/// * if no such space exists, the hard cut at the character boundary is used.
fn truncate_to_max_chunk(text: &str) -> String {
    if text.len() <= MAX_CHUNK_SIZE {
        return text.to_string();
    }

    // Largest char boundary <= MAX_CHUNK_SIZE; slicing elsewhere would panic.
    let mut end = MAX_CHUNK_SIZE;
    while end > 0 && !text.is_char_boundary(end) {
        end -= 1;
    }

    // `end < text.len()` here, so indexing the byte at `end` is in bounds.
    let ends_on_word_boundary = text.as_bytes()[end] == b' ';
    let cut = if ends_on_word_boundary {
        end
    } else {
        match text[..end].rfind(' ') {
            Some(pos) if pos > 0 => pos,
            _ => end,
        }
    };

    text[..cut].to_string()
}