// vectus 0.1.36
//
// A vector database implemented in Rust for learning purposes.
// Documentation
use crate::document::{DocBuilder, Document};

/// Splits `text` into consecutive chunks and wraps each one in a [`Document`].
///
/// Each chunk is taken from a window of at most `max_length` bytes. Inside the
/// window the separators `"\n\n"`, `"\n"`, `"."` and `" "` are tried in that
/// order; the first separator that occurs anywhere in the window decides the
/// cut, which is placed right after its last occurrence. If none occurs, the
/// whole window becomes the chunk.
///
/// # Arguments
///
/// * `text` - The text to split.
/// * `max_length` - Maximum chunk size in bytes. Defaults to `1000`. A value of
///   `0` is treated as `1`, because an empty window can never consume input.
/// * `overlap` - Number of trailing bytes of a chunk that are repeated at the
///   start of the next one. Defaults to `0`.
///
/// # Returns
///
/// The chunks in input order. An empty `text` yields an empty vector.
///
/// # Notes
///
/// * Cuts are always moved onto UTF-8 character boundaries, so multi-byte text
///   never causes a slicing panic. A single character wider than `max_length`
///   bytes is emitted whole, which is the only case where a chunk may exceed
///   `max_length`.
/// * Every step is guaranteed to advance: when `overlap` would move the next
///   start back to (or before) the current start, the overlap is dropped for
///   that step instead of looping forever.
/// * The body contains no `.await` points; the function is `async` only as
///   part of its signature.
pub async fn recursive_character_text_split(
    text: &str,
    max_length: Option<usize>,
    overlap: Option<usize>,
) -> Vec<Document> {
    // Tried in priority order; the first separator found in the window wins.
    const SEPARATORS: [&str; 4] = ["\n\n", "\n", ".", " "];

    // A zero-byte window would never make progress, so clamp to one byte.
    let max_length = max_length.unwrap_or(1000).max(1);
    let overlap = overlap.unwrap_or(0);
    let text_len = text.len();

    // The owned copy keeps the argument type identical to the original call
    // (`&String`), since the builder's parameter type is not visible here.
    let make_doc =
        |content: &str| DocBuilder::new().with_page_content(&content.to_string()).build();

    let mut result = Vec::new();
    // Invariant: `start` always sits on a UTF-8 character boundary.
    let mut start = 0;

    while start < text_len {
        // Everything that is left fits into a single chunk.
        if start.saturating_add(max_length) >= text_len {
            result.push(make_doc(&text[start..]));
            break;
        }

        // `start + max_length < text_len` here, so the sum cannot overflow.
        let mut window_end = start + max_length;
        // Pull the window end back so it does not split a multi-byte character.
        while !text.is_char_boundary(window_end) {
            window_end -= 1;
        }
        if window_end == start {
            // The next character alone is wider than `max_length` bytes; take
            // it whole rather than emitting an empty chunk.
            window_end = start + 1;
            while !text.is_char_boundary(window_end) {
                window_end += 1;
            }
        }

        let window = &text[start..window_end];
        // Cut right after the last occurrence of the highest-priority
        // separator present, or at the window end when there is none.
        let end = SEPARATORS
            .iter()
            .find_map(|&sep| window.rfind(sep).map(|idx| start + idx + sep.len()))
            .unwrap_or(window_end);

        result.push(make_doc(&text[start..end]));

        if end >= text_len {
            break;
        }

        // Step back by `overlap`, then forward onto a character boundary.
        let mut next_start = end.saturating_sub(overlap);
        while !text.is_char_boundary(next_start) {
            next_start += 1;
        }
        // Guarantee forward progress: if the overlap swallows the whole chunk,
        // continue from the end of the chunk without overlap.
        start = if next_start > start { next_start } else { end };
    }

    result
}