// vectus 0.1.37
//
// A vector database implemented in Rust for learning purposes.
// Documentation
use crate::document::{DocBuilder, Document};

pub async fn recursive_character_text_split(
    text: &str,
    max_length: Option<usize>,
    overlap: Option<usize>,
) -> Vec<Document> {
    let max_length = max_length.unwrap_or(1000);
    let overlap = overlap.unwrap_or(0);

    let separators = vec!["\n\n", "\n", ".", " ", ""];

    let mut result = Vec::new();
    let mut start = 0;
    let text_len = text.len();

    while start < text_len {
        let mut end = start + max_length;

        if end >= text_len {
            end = text_len;
        } else {
            while !text.is_char_boundary(end) {
                end -= 1;
            }
        }

        let mut found_separator = false;

        for separator in &separators {
            if separator.is_empty() {
                continue;
            }

            if let Some(index) = text[start..end].rfind(separator) {
                end = start + index + separator.len();
                found_separator = true;
                break;
            }
        }

        if !found_separator {
            end = start + max_length;
            if end < text_len {
                while !text.is_char_boundary(end) {
                    end -= 1;
                }
            }
        }

        result.push(
            DocBuilder::new()
                .with_page_content(&text[start..end].to_string())
                .build(),
        );

        start = if end >= text_len {
            text_len
        } else {
            end.saturating_sub(overlap)
        };
    }

    result
}