//! TextSplitters break text into parts small enough to be fed to the model.
//!
//! Because chunk sizes are measured in tokens rather than characters, a
//! `TextSplitter` works directly with the model's token stream.
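//!
//! # Example
//!
//! A minimal sketch of typical usage, assuming this module is reachable as
//! `crate::text_splitters` (adjust the path to your crate layout):
//!
//! ```ignore
//! use crate::text_splitters::{NaiveWhitespaceSplitter, TextSplitter};
//!
//! let splitter = NaiveWhitespaceSplitter;
//! // Four whitespace tokens, windows of 3 tokens, 1 token of overlap.
//! let chunks = splitter.split_text("one two three four", 3, 1).unwrap();
//! assert_eq!(chunks, vec!["one two three", "three four"]);
//! ```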
use crate::tokens::{Tokenizer, TokenizerError};
use std::cmp::max;

pub trait TextSplitter<TokenType>: Tokenizer<TokenType>
where
    TokenType: Clone,
{
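    /// Splits `doc` into chunks of at most `max_tokens_per_chunk` tokens, with
    /// consecutive chunks sharing `chunk_overlap` tokens where possible.
    ///
    /// Implemented as a sliding window over the token stream: window starts are
    /// `max_tokens_per_chunk - chunk_overlap` tokens apart (clamped to at least
    /// 1), and each window is detokenized back into a string.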
    fn split_text(
        &self,
        doc: &str,
        max_tokens_per_chunk: usize,
        chunk_overlap: usize,
    ) -> Result<Vec<String>, TokenizerError> {
        let tokens = self.tokenize_str(doc)?;
        // Consecutive windows start `step_size` tokens apart. Clamp to at least
        // 1 so the loop always advances, even when chunk_overlap >=
        // max_tokens_per_chunk (that case degenerates to a one-token step).
        let step_size = max(max_tokens_per_chunk.saturating_sub(chunk_overlap), 1);

        debug_assert_ne!(step_size, 0);

        (0..tokens.len())
            .step_by(step_size)
            .map(|start_idx| {
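                // Each window spans at most max_tokens_per_chunk tokens,
                // truncated at the end of the token stream.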
                let end_idx = usize::min(start_idx + max_tokens_per_chunk, tokens.len());
                self.to_string(tokens[start_idx..end_idx].to_vec())
            })
            .collect()
    }
}

/// A baseline splitter that treats every whitespace-separated word as one
/// token.
///
/// Real model tokenizers emit sub-word tokens, so this is only an
/// approximation, mainly useful for tests and examples.
pub struct NaiveWhitespaceSplitter;

impl Tokenizer<String> for NaiveWhitespaceSplitter {
    fn tokenize_str(&self, doc: &str) -> Result<Vec<String>, TokenizerError> {
        Ok(doc.split_whitespace().map(|t| t.to_string()).collect())
    }

    fn to_string(&self, tokens: Vec<String>) -> Result<String, TokenizerError> {
        Ok(tokens.join(" "))
    }
}

impl TextSplitter<String> for NaiveWhitespaceSplitter {}
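
// A minimal sketch of a second implementer (hypothetical, not part of the
// original module), showing that any `Tokenizer` picks up `split_text` for
// free from the trait's default method: here each Unicode scalar value is
// treated as one token.
pub struct CharSplitter;

impl Tokenizer<char> for CharSplitter {
    fn tokenize_str(&self, doc: &str) -> Result<Vec<char>, TokenizerError> {
        Ok(doc.chars().collect())
    }

    fn to_string(&self, tokens: Vec<char>) -> Result<String, TokenizerError> {
        Ok(tokens.into_iter().collect())
    }
}

// No methods to implement: the default `split_text` applies as-is.
impl TextSplitter<char> for CharSplitter {}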

#[cfg(test)]
mod tests {
    use super::{NaiveWhitespaceSplitter, TextSplitter, TokenizerError};

    #[test]
    fn whitespace_splitter_no_overlap() -> Result<(), TokenizerError> {
        let doc = "This is a sample text that will be split into chunks based on tokens.";
        let max_tokens_per_chunk = 4;
        let chunk_overlap = 0;

        let splitter = NaiveWhitespaceSplitter;

        let chunks = splitter.split_text(doc, max_tokens_per_chunk, chunk_overlap)?;

        assert_eq!(
            chunks,
            vec![
                "This is a sample",
                "text that will be",
                "split into chunks based",
                "on tokens."
            ]
        );

        Ok(())
    }

    #[test]
    fn whitespace_splitter_1_overlap() -> Result<(), TokenizerError> {
        let doc = "This is a sample text that will be split into chunks based on tokens.";
        let max_tokens_per_chunk = 4;
        let chunk_overlap = 1;

        let splitter = NaiveWhitespaceSplitter;

        let chunks = splitter.split_text(doc, max_tokens_per_chunk, chunk_overlap)?;

        assert_eq!(
            chunks,
            vec![
                "This is a sample",
                "sample text that will",
                "will be split into",
                "into chunks based on",
                "on tokens."
            ]
        );

        Ok(())
    }

    #[test]
    fn whitespace_splitter_equal_overlap() -> Result<(), TokenizerError> {
        let doc = "This is a sample text that will be split into chunks based on tokens.";
        let max_tokens_per_chunk = 4;
        let chunk_overlap = max_tokens_per_chunk;

        let splitter = NaiveWhitespaceSplitter;

        let chunks = splitter.split_text(doc, max_tokens_per_chunk, chunk_overlap)?;

        assert_eq!(
            chunks,
            vec![
                "This is a sample",
                "is a sample text",
                "a sample text that",
                "sample text that will",
                "text that will be",
                "that will be split",
                "will be split into",
                "be split into chunks",
                "split into chunks based",
                "into chunks based on",
                "chunks based on tokens.",
                "based on tokens.",
                "on tokens.",
                "tokens."
            ]
        );

        Ok(())
    }
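
    // A sketch of an edge-case check (not in the original suite): an empty
    // document tokenizes to zero tokens, so no chunks are produced.
    #[test]
    fn whitespace_splitter_empty_doc() -> Result<(), TokenizerError> {
        let splitter = NaiveWhitespaceSplitter;
        let chunks = splitter.split_text("", 4, 1)?;
        assert!(chunks.is_empty());
        Ok(())
    }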
}