chonkier 0.0.2

🦛 Chonkie, now in Rust 🦀: No-nonsense, ultra-fast, ultra-light chunking library
Documentation
/*
    This file contains the necessary types for the RecursiveChunker implementation.
    This includes the RecursiveLevel and RecursiveRules structs, which together
    define the rules for the RecursiveChunker.

    Alongside these, the RecursiveChunk struct is defined here as well.
*/

// Import the serde derive traits and the fmt module (used for the Display impls)
use serde::{Deserialize, Serialize};
use std::fmt;

/// Chunk type used by the RecursiveChunker implementation: a piece of text
/// together with its index range, token count and level.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct RecursiveChunk {
    /// Text of the chunk.
    pub text: String,

    /// The start index of the chunk.
    // NOTE(review): presumably an offset into the original input text; whether it
    // counts bytes or characters is not visible in this file — confirm with the chunker.
    pub start_index: usize,

    /// The end index of the chunk.
    // NOTE(review): inclusive vs. exclusive is not visible in this file — confirm.
    pub end_index: usize,

    /// The token count of the chunk.
    pub token_count: usize,

    /// The level of the chunk.
    // NOTE(review): presumably the index of the rule level that produced this
    // chunk — confirm against the chunker.
    pub level: usize,
}

impl RecursiveChunk {
    pub fn new(
        text: String,
        start_index: usize,
        end_index: usize,
        token_count: usize,
        level: usize,
    ) -> Self {
        RecursiveChunk {
            text,
            start_index,
            end_index,
            token_count,
            level,
        }
    }

    pub fn new_without_level(
        text: String,
        start_index: usize,
        end_index: usize,
        token_count: usize,
    ) -> Self {
        RecursiveChunk {
            text,
            start_index,
            end_index,
            token_count,
            level: 0,
        }
    }
}

impl fmt::Display for RecursiveChunk {
    /// Renders the chunk on a single line as `RecursiveChunk(...)`, listing
    /// `text`, `start_index`, `end_index`, `token_count` and `level` by name.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            text,
            start_index,
            end_index,
            token_count,
            level,
        } = self;

        f.write_fmt(format_args!(
            "RecursiveChunk(text: {}, start_index: {}, end_index: {}, token_count: {}, level: {})",
            text, start_index, end_index, token_count, level
        ))
    }
}

/// One level of rules for the RecursiveChunker: a list of delimiters plus a
/// whitespace flag.
///
/// NOTE: When `whitespace` is `false` and `delimiters` is empty, we can safely
/// assume that the user is asking us to chunk via the token chunking logic.
/// That is exactly the value produced by the derived `Default` implementation
/// (an empty `Vec` and `false`).
#[derive(Debug, Clone, Default)]
pub struct RecursiveLevel {
    /// The delimiters of this level (a growable list; it may be empty).
    pub delimiters: Vec<String>,

    /// The whitespace flag of this level.
    // NOTE(review): presumably enables splitting on whitespace; the code that
    // consumes this flag is not visible in this file — confirm with the chunker.
    pub whitespace: bool,
}

impl RecursiveLevel {
    /// Creates a level from an explicit delimiter list and whitespace flag.
    ///
    /// Both arguments are stored unchanged.
    pub fn new(delimiters: Vec<String>, whitespace: bool) -> Self {
        Self {
            delimiters,
            whitespace,
        }
    }
}

/// The RecursiveRules struct: the list of `RecursiveLevel`s that together
/// define the rules for the RecursiveChunker.
#[derive(Debug, Clone)]
pub struct RecursiveRules {
    /// The list of `RecursiveLevel`s making up these rules.
    pub levels: Vec<RecursiveLevel>,
}

impl RecursiveRules {
    pub fn new(levels: Vec<RecursiveLevel>) -> Self {
        RecursiveRules { levels }
    }
}

impl Default for RecursiveRules {
    /// Builds the default rule set of five levels, in this order: paragraphs,
    /// sentences, sub-sentences, words, and finally the default
    /// `RecursiveLevel` (no delimiters, no whitespace).
    fn default() -> Self {
        // Local helper so each level reads as a plain list of string literals.
        fn level(delimiters: &[&str], whitespace: bool) -> RecursiveLevel {
            RecursiveLevel::new(
                delimiters.iter().map(|&d| d.to_owned()).collect(),
                whitespace,
            )
        }

        RecursiveRules {
            levels: vec![
                // First level: the paragraphing logic.
                // NOTE(review): "\n\r" is kept exactly as it was; if a Windows
                // line ending was intended it would be "\r\n" — confirm.
                level(&["\n\n", "\n\r"], false),
                // Second level: the sentence logic.
                level(&[".", "?", "!", "\n", "\r"], true),
                // Third level: the sub-sentence logic.
                level(&[";", ":"], true),
                // Fourth level: the word logic (whitespace only, no delimiters).
                level(&[], true),
                // Final level: the default logic.
                RecursiveLevel::default(),
            ],
        }
    }
}

impl fmt::Display for RecursiveRules {
    /// Writes `RecursiveRules(levels: ...)`, where the levels are shown using
    /// their `Debug` representation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let levels = &self.levels;
        f.write_fmt(format_args!("RecursiveRules(levels: {:?})", levels))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The default rule set must contain exactly five levels.
    #[test]
    fn test_recursive_rules() {
        let level_count = RecursiveRules::default().levels.len();
        assert_eq!(level_count, 5);
    }
}