use crate::document::{DocBuilder, Document};
/// Splits `text` into consecutive chunks of roughly `max_length` bytes,
/// preferring to cut at a natural separator.
///
/// For each window of `max_length` bytes the separators `"\n\n"`, `"\n"`,
/// `"."` and `" "` are tried in that order; the first one that occurs in the
/// window wins, and the chunk ends right after its last occurrence. If none
/// occurs, the chunk is the whole window. The remaining tail of the text is
/// emitted as a final chunk once it fits in a single window.
///
/// # Arguments
///
/// * `text` - The text to split.
/// * `max_length` - Maximum chunk size in bytes; defaults to `1000`. A value
///   of `0` is treated as `1`, since an empty window could never consume input.
/// * `overlap` - Number of bytes the next chunk re-reads from the end of the
///   previous one; defaults to `0`. When applying the overlap would not move
///   past the start of the chunk just emitted, it is skipped for that step so
///   the loop always terminates.
///
/// # Returns
///
/// One `Document` per chunk, in order of appearance. Empty `text` yields an
/// empty vector.
///
/// Cut points are always snapped to UTF-8 character boundaries, so a chunk may
/// exceed `max_length` only when a single character is wider than the window.
pub async fn recursive_character_text_split(
    text: &str,
    max_length: Option<usize>,
    overlap: Option<usize>,
) -> Vec<Document> {
    // Ordered from most to least preferred break point.
    const SEPARATORS: [&str; 4] = ["\n\n", "\n", ".", " "];

    let max_length = max_length.unwrap_or(1000).max(1);
    let overlap = overlap.unwrap_or(0);
    let text_len = text.len();

    let mut result = Vec::new();
    let mut start = 0;

    while start < text_len {
        // The rest fits in one window: emit it and stop.
        if start.saturating_add(max_length) >= text_len {
            // NOTE(review): the temporary `String` may be unnecessary if
            // `with_page_content` accepts `&str` — confirm against DocBuilder.
            result.push(
                DocBuilder::new()
                    .with_page_content(&text[start..].to_string())
                    .build(),
            );
            break;
        }

        // Never cut inside a multi-byte character: shrink the window to the
        // previous boundary, or, if that empties it, grow to the next one.
        let mut window_end = floor_to_char_boundary(text, start + max_length);
        if window_end <= start {
            window_end = ceil_to_char_boundary(text, start + 1);
        }
        let window = &text[start..window_end];

        // End the chunk just after the last occurrence of the most preferred
        // separator present in the window; otherwise use the whole window.
        let end = SEPARATORS
            .iter()
            .find_map(|sep| window.rfind(*sep).map(|idx| start + idx + sep.len()))
            .unwrap_or(window_end);

        result.push(
            DocBuilder::new()
                .with_page_content(&text[start..end].to_string())
                .build(),
        );

        // Step back by `overlap`, but only if that still makes forward
        // progress; otherwise the same chunk would be emitted forever.
        let rewound = ceil_to_char_boundary(text, end.saturating_sub(overlap));
        start = if end < text_len && rewound > start {
            rewound
        } else {
            end
        };
    }

    result
}

/// Returns the largest UTF-8 character boundary of `text` that is `<= index`.
fn floor_to_char_boundary(text: &str, index: usize) -> usize {
    let mut pos = index.min(text.len());
    // Offset 0 is always a boundary, so this cannot underflow.
    while !text.is_char_boundary(pos) {
        pos -= 1;
    }
    pos
}

/// Returns the smallest UTF-8 character boundary of `text` that is `>= index`.
fn ceil_to_char_boundary(text: &str, index: usize) -> usize {
    let mut pos = index.min(text.len());
    // `text.len()` is always a boundary, so this cannot run past the end.
    while !text.is_char_boundary(pos) {
        pos += 1;
    }
    pos
}