character_text_splitter 0.1.3

A Rust library for splitting text into chunks with overlap, designed for handling large amounts of text efficiently. Implementation is identical to langchain's CharacterTextSplitter
Documentation
pub struct CharacterTextSplitter {
    chunk_size: usize,
    chunk_overlap: usize,
    separator: String,
}

impl CharacterTextSplitter {
    pub fn new() -> CharacterTextSplitter {
        CharacterTextSplitter {
            chunk_size: 200,
            chunk_overlap: 40,
            separator: "\n\n".to_string(),
        }
    }

    pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
        self.chunk_size = chunk_size;
        self
    }

    pub fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
        self.chunk_overlap = chunk_overlap;
        self    
    }

    pub fn with_separator(mut self, separator: &str) -> Self {
        self.separator = separator.to_string();
        self
    }
    
    fn merge_splits(&self, splits: Vec<&str>) -> Vec<String> {
        let separator_len = self.separator.len();
        let mut docs: Vec<String> = Vec::new();
        let mut current_doc: Vec<&str> = Vec::new();
        let mut total = 0;

        for d in splits {
            let len = d.len();
            if total
                + len
                + if current_doc.is_empty() {
                    0
                } else {
                    separator_len
                }
                > self.chunk_size
            {
                if total > self.chunk_size {
                    println!(
                        "Created a chunk of size {}, which is longer than the specified {}",
                        total, self.chunk_size
                    );
                }
                if !current_doc.is_empty() {
                    let doc = current_doc.join(&self.separator);
                    if !doc.is_empty() {
                        docs.push(doc);
                    }
                    while total > self.chunk_overlap
                        || (total
                            + len
                            + if current_doc.is_empty() {
                                0
                            } else {
                                separator_len
                            }
                            > self.chunk_size
                            && total > 0)
                    {
                        total -= current_doc[0].len()
                            + if current_doc.len() > 1 {
                                separator_len
                            } else {
                                0
                            };
                        current_doc.remove(0);
                    }
                }
            }
            current_doc.push(d);
            total += len
                + if current_doc.len() > 1 {
                    separator_len
                } else {
                    0
                };
        }

        let doc = current_doc.join(&self.separator);
        if !doc.is_empty() {
            docs.push(doc);
        }

        docs
    }

    pub fn split_text(&self, text: &str) -> Vec<String> {
        let splits: Vec<&str> = text.split(&self.separator).collect();
        self.merge_splits(splits)
    }
}