chonkier 0.0.2

🦛 Chonkie, now in Rust 🦀: No-nonsense, ultra-fast, ultra-light chunking library
Documentation
/*
    A module containing the types for the sentence chunker.

    The types are:
    - SentenceChunk: A chunk of text, optionally carrying the sentences it is made of.
    - Sentence: A sentence in the text.
*/

// Serde derive macros for (de)serialization, and std::fmt for the Display impls below
use serde::{Deserialize, Serialize};
use std::fmt;

/// A sentence together with its position and token count in the original text.
///
/// Derives `Serialize` and `Deserialize`, so values can be passed through serde.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Sentence {
    /// The text of the sentence
    pub text: String,
    /// The start index of the sentence in the original text
    // NOTE(review): this file does not show whether indices are byte or character
    // offsets — confirm against the chunker that fills them in.
    pub start_index: usize,
    /// The end index of the sentence in the original text
    // NOTE(review): inclusive vs. exclusive is not established here — TODO confirm.
    pub end_index: usize,
    /// The token count of the sentence
    pub token_count: usize,
}

// Constructor and size helpers for Sentence
impl Sentence {
    /// Builds a `Sentence` from a borrowed string slice and its metadata.
    ///
    /// `text` is copied into an owned `String`. The indices and the token count are
    /// stored exactly as given; nothing is validated against `text`.
    pub fn new(text: &str, start_index: usize, end_index: usize, token_count: usize) -> Self {
        let text = String::from(text);
        Self {
            text,
            start_index,
            end_index,
            token_count,
        }
    }

    /// Returns the length of the sentence text in bytes (the value of `String::len`).
    pub fn len(&self) -> usize {
        let Self { text, .. } = self;
        text.len()
    }

    /// Returns `true` when the sentence text is the empty string.
    pub fn is_empty(&self) -> bool {
        self.text.is_empty()
    }
}

// Single-line, human-readable rendering of a sentence and its metadata.
impl fmt::Display for Sentence {
    /// Writes `Sentence(text: …, start_index: …, end_index: …, token_count: …)` to `f`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!(
            "Sentence(text: {}, start_index: {}, end_index: {}, token_count: {})",
            self.text, self.start_index, self.end_index, self.token_count
        ))
    }
}

/// A chunk of text with its position and token count, optionally carrying the
/// individual sentences it contains.
///
/// Derives `Serialize` and `Deserialize`, so values can be passed through serde.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SentenceChunk {
    /// The text of the chunk
    pub text: String,
    /// The start index of the chunk in the original text
    // NOTE(review): this file does not show whether indices are byte or character
    // offsets — confirm against the chunker that fills them in.
    pub start_index: usize,
    /// The end index of the chunk in the original text
    // NOTE(review): inclusive vs. exclusive is not established here — TODO confirm.
    pub end_index: usize,
    /// The token count of the chunk
    pub token_count: usize,
    /// The sentences in the chunk; `None` when no sentence list is attached
    pub sentences: Option<Vec<Sentence>>,
}

impl SentenceChunk {
    /// Creates a new sentence chunk.
    ///
    /// `text` is copied into an owned `String`. The indices, the token count and the
    /// optional sentence list are stored exactly as given; nothing is validated.
    pub fn new(
        text: &str,
        start_index: usize,
        end_index: usize,
        token_count: usize,
        sentences: Option<Vec<Sentence>>,
    ) -> Self {
        Self {
            text: text.to_string(),
            start_index,
            end_index,
            token_count,
            sentences,
        }
    }

    /// Returns a deep copy of the sentence chunk.
    ///
    /// Delegates to the derived `Clone` implementation instead of duplicating each field
    /// by hand. The returned value owns its own text and sentence list, so mutating it
    /// does not affect `self`.
    pub fn copy(&self) -> Self {
        self.clone()
    }

    /// Returns the length of the chunk text in bytes (the value of `String::len`).
    pub fn len(&self) -> usize {
        self.text.len()
    }

    /// Returns `true` when the chunk text is the empty string.
    pub fn is_empty(&self) -> bool {
        self.text.is_empty()
    }
}

// Single-line, human-readable rendering of a chunk and its metadata.
impl fmt::Display for SentenceChunk {
    /// Writes `SentenceChunk(text: …, start_index: …, end_index: …, token_count: …, sentences: …)`
    /// to `f`. The `sentences` field is rendered with its `Debug` representation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!(
            "SentenceChunk(text: {}, start_index: {}, end_index: {}, token_count: {}, sentences: {:?})",
            self.text, self.start_index, self.end_index, self.token_count, self.sentences
        ))
    }
}

// Unit tests for `Sentence` and `SentenceChunk`: construction, length/emptiness,
// copying, `Display` output and (behind the `json` feature) a serde round trip.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sentence_new() {
        let sentence = Sentence::new("Hello, world!", 0, 12, 2);
        assert_eq!(sentence.text, "Hello, world!");
        assert_eq!(sentence.start_index, 0);
        assert_eq!(sentence.end_index, 12);
        assert_eq!(sentence.token_count, 2);
    }

    #[test]
    fn test_sentence_len() {
        let sentence = Sentence::new("Hello, world!", 0, 12, 2);
        assert_eq!(sentence.len(), 13);
    }

    #[test]
    fn test_sentence_is_empty() {
        assert!(Sentence::new("", 0, 0, 0).is_empty());
        assert!(!Sentence::new("Hello, world!", 0, 12, 2).is_empty());
    }

    #[test]
    fn test_sentence_display() {
        let sentence = Sentence::new("Hello, world!", 0, 12, 2);
        assert_eq!(
            format!("{}", sentence),
            "Sentence(text: Hello, world!, start_index: 0, end_index: 12, token_count: 2)"
        );
    }

    #[test]
    fn test_sentence_chunk_new() {
        let sentence_chunk = SentenceChunk::new("Hello, world!", 0, 12, 2, None);
        assert_eq!(sentence_chunk.text, "Hello, world!");
        assert_eq!(sentence_chunk.start_index, 0);
        assert_eq!(sentence_chunk.end_index, 12);
        assert_eq!(sentence_chunk.token_count, 2);
        assert!(sentence_chunk.sentences.is_none());
    }

    #[test]
    fn test_sentence_chunk_len() {
        let sentence_chunk = SentenceChunk::new("Hello, world!", 0, 12, 2, None);
        assert_eq!(sentence_chunk.len(), 13);
    }

    #[test]
    fn test_sentence_chunk_is_empty() {
        assert!(SentenceChunk::new("", 0, 0, 0, None).is_empty());
        assert!(!SentenceChunk::new("Hello, world!", 0, 12, 2, None).is_empty());
    }

    #[test]
    fn test_sentence_chunk_copy() {
        let sentences = vec![Sentence::new("Hello, world!", 0, 12, 2)];
        let original = SentenceChunk::new("Hello, world!", 0, 12, 2, Some(sentences));
        let mut copied = original.copy();

        assert_eq!(copied.text, original.text);
        assert_eq!(copied.start_index, original.start_index);
        assert_eq!(copied.end_index, original.end_index);
        assert_eq!(copied.token_count, original.token_count);
        assert_eq!(copied.sentences.as_ref().map(Vec::len), Some(1));

        // The copy owns its data: changing it must not leak back into the source.
        copied.text.push('?');
        copied.sentences = None;
        assert_eq!(original.text, "Hello, world!");
        assert_eq!(original.sentences.as_ref().map(Vec::len), Some(1));
    }

    #[test]
    fn test_sentence_chunk_display() {
        let sentence_chunk = SentenceChunk::new("Hello, world!", 0, 12, 2, None);
        assert_eq!(
            format!("{}", sentence_chunk),
            "SentenceChunk(text: Hello, world!, start_index: 0, end_index: 12, token_count: 2, sentences: None)"
        );
    }

    #[test]
    #[cfg(feature = "json")]
    fn test_sentence_chunk_serde() {
        let sentence_chunk = SentenceChunk::new("Hello, world!", 0, 12, 2, None);
        let serialized = serde_json::to_string(&sentence_chunk).unwrap();
        let deserialized: SentenceChunk = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized.text, "Hello, world!");
        assert_eq!(deserialized.start_index, 0);
        assert_eq!(deserialized.end_index, 12);
        assert_eq!(deserialized.token_count, 2);
        assert!(deserialized.sentences.is_none());
    }
}