chonkier 0.0.2

🦛 Chonkie, now in Rust 🦀: No-nonsense, ultra-fast, ultra-light chunking library
Documentation
/*
    The Chunk type represents a single chunk of text together with its
    metadata: the chunk text, its start and end indices in the original
    text, and its token count.
*/

// Import serde's Serialize/Deserialize derives and the std::fmt module (used to implement Display)
use serde::{Deserialize, Serialize};
use std::fmt;

/// A single chunk of text together with its position and token metadata.
///
/// The type derives `Debug`, `Clone`, and serde's `Serialize`/`Deserialize`.
/// All fields are public and are stored exactly as supplied to the constructor.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// The text of the chunk.
    pub text: String,

    /// The start index of the chunk in the original text.
    ///
    /// NOTE(review): whether this is a byte or character offset is not
    /// established in this file; confirm against the code that produces chunks.
    pub start_index: usize,

    /// The end index of the chunk in the original text.
    ///
    /// NOTE(review): whether this bound is inclusive or exclusive is not
    /// established in this file; confirm against the code that produces chunks.
    pub end_index: usize,

    /// The token count of the chunk.
    pub token_count: usize,
}

impl Chunk {
    // Create a new chunk (Constructor)
    pub fn new(text: String, start_index: usize, end_index: usize, token_count: usize) -> Self {
        Self {
            text,
            start_index,
            end_index,
            token_count,
        }
    }

    // Get the length of the chunk
    pub fn len(&self) -> usize {
        self.text.len()
    }

    pub fn is_empty(&self) -> bool{
        self.text.is_empty()
    }
}

impl fmt::Display for Chunk {
    /// Writes the chunk as a single line of the form
    /// `Chunk(text: .., start_index: .., end_index: .., token_count: ..)`.
    ///
    /// The text is emitted verbatim (no quoting or escaping), followed by the
    /// three numeric fields in declaration order.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pull every field out by reference so the format call reads cleanly.
        let Self {
            text,
            start_index,
            end_index,
            token_count,
        } = self;

        f.write_fmt(format_args!(
            "Chunk(text: {}, start_index: {}, end_index: {}, token_count: {})",
            text, start_index, end_index, token_count
        ))
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for `Chunk`: construction, cloning, length/emptiness,
    //! `Display` formatting and (behind the `json` feature) a serde round trip.

    use super::*;

    // Builds the fixture shared by most tests so each test states only what
    // it actually checks.
    fn sample_chunk() -> Chunk {
        Chunk::new("Hello, world!".to_string(), 0, 12, 2)
    }

    // The constructor must store every argument unchanged.
    #[test]
    fn test_chunk_new() {
        let chunk = sample_chunk();
        assert_eq!(chunk.text, "Hello, world!");
        assert_eq!(chunk.start_index, 0);
        assert_eq!(chunk.end_index, 12);
        assert_eq!(chunk.token_count, 2);
    }

    // A clone carries the same field values and leaves the source intact.
    #[test]
    fn test_chunk_clone() {
        let chunk = sample_chunk();
        let cloned_chunk = chunk.clone();
        assert_eq!(cloned_chunk.text, "Hello, world!");
        assert_eq!(cloned_chunk.start_index, 0);
        assert_eq!(cloned_chunk.end_index, 12);
        assert_eq!(cloned_chunk.token_count, 2);

        // The original is still usable and unchanged after cloning.
        assert_eq!(chunk.text, cloned_chunk.text);
    }

    // `len` reports the byte length of the text, not the character count.
    #[test]
    fn test_chunk_len() {
        assert_eq!(sample_chunk().len(), 13);

        // 'é' is two bytes in UTF-8, so five characters occupy six bytes.
        let accented = Chunk::new("héllo".to_string(), 0, 5, 1);
        assert_eq!(accented.len(), 6);
    }

    // `is_empty` depends only on the text, not on the index fields.
    #[test]
    fn test_chunk_is_empty() {
        assert!(!sample_chunk().is_empty());

        let empty = Chunk::new(String::new(), 4, 4, 0);
        assert!(empty.is_empty());
        assert_eq!(empty.len(), 0);
    }

    // `Display` renders all four fields in a fixed, single-line layout.
    #[test]
    fn test_chunk_display() {
        let chunk = sample_chunk();
        assert_eq!(
            format!("{}", chunk),
            "Chunk(text: Hello, world!, start_index: 0, end_index: 12, token_count: 2)"
        );
    }

    // Serializing to JSON and back must reproduce every field.
    #[test]
    #[cfg(feature = "json")]
    fn test_chunk_serde() {
        let chunk = sample_chunk();
        let serialized = serde_json::to_string(&chunk).unwrap();
        let deserialized: Chunk = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized.text, "Hello, world!");
        assert_eq!(deserialized.start_index, 0);
        assert_eq!(deserialized.end_index, 12);
        assert_eq!(deserialized.token_count, 2);
    }
}