overlap_chunk/
lib.rs

/// Tuning knobs for [`chunk_text`].
///
/// `Default` yields a configuration with no overlap between chunks.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ChunkOptions {
    /// Share of each chunk, in percent, that is repeated at the start of the
    /// next chunk. Meaningful range is 0-90; `chunk_text` treats any larger
    /// value as 90.
    pub overlap_percentage: u8,
}
14
15/// Split text into chunks of specified size
16///
17/// # Arguments
18///
19/// * `text` - The text to be split
20/// * `chunk_size` - Maximum size of each chunk (in characters)
21/// * `options` - Optional chunking options
22///
23/// # Returns
24///
25/// A vector of text chunks
26///
27/// # Example
28///
29/// ```
30/// use overlap_chunk::{chunk_text, ChunkOptions};
31///
32/// let text = "This is a test text. We will split this long text into smaller chunks.";
33/// let chunks = chunk_text(text, 10, None);
34/// assert_eq!(chunks.len(), 7);
35///
36/// let options = ChunkOptions {
37///     overlap_percentage: 50,
38///     ..Default::default()
39/// };
40/// let chunks_with_overlap = chunk_text(text, 10, Some(options));
41/// assert_eq!(chunks_with_overlap.len(), 14);
42/// ```
43pub fn chunk_text(text: &str, chunk_size: usize, options: Option<ChunkOptions>) -> Vec<String> {
44    if text.is_empty() || chunk_size == 0 {
45        return vec![];
46    }
47
48    let mut options = options.unwrap_or_default();
49
50    // Limit overlap to 90%
51    if options.overlap_percentage > 90 {
52        options.overlap_percentage = 90;
53    }
54
55    // Convert to character vector for proper handling
56    let chars: Vec<char> = text.chars().collect();
57    let total_chars = chars.len();
58
59    if total_chars <= chunk_size {
60        return vec![text.to_string()];
61    }
62
63    let mut chunks = Vec::new();
64
65    // Calculate overlap size
66    let overlap_size =
67        ((chunk_size as f64 * options.overlap_percentage as f64 / 100.0) + 0.5) as usize;
68
69    // Calculate step size considering overlap
70    let step_size = if overlap_size >= chunk_size {
71        // Even with maximum overlap (90%), ensure minimum step size
72        (chunk_size as f64 * 0.1).ceil() as usize
73    } else {
74        chunk_size - overlap_size
75    };
76
77    let mut start = 0;
78
79    while start < total_chars {
80        let end = std::cmp::min(start + chunk_size, total_chars);
81        let chunk: String = chars[start..end].iter().collect();
82        chunks.push(chunk);
83
84        // Calculate start position for next chunk
85        start += step_size;
86    }
87
88    chunks
89}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// 70-character sample shared by the multi-chunk tests.
    const SAMPLE: &str = "This is a test text. We will split this long text into smaller chunks.";

    /// Builds options that only set the overlap percentage.
    fn with_overlap(percentage: u8) -> Option<ChunkOptions> {
        Some(ChunkOptions {
            overlap_percentage: percentage,
            ..Default::default()
        })
    }

    #[test]
    fn test_empty_text() {
        // Empty input produces no chunks at all.
        assert!(chunk_text("", 10, None).is_empty());
    }

    #[test]
    fn test_text_smaller_than_chunk() {
        // Text that fits into one chunk comes back unchanged as the only chunk.
        let text = "Small text";
        assert_eq!(chunk_text(text, 20, None), vec![text]);
    }

    #[test]
    fn test_no_overlap() {
        // Without overlap the 70 characters split into exactly seven
        // consecutive 10-character pieces.
        let expected = [
            "This is a ",
            "test text.",
            " We will s",
            "plit this ",
            "long text ",
            "into small",
            "er chunks.",
        ];
        assert_eq!(chunk_text(SAMPLE, 10, None), expected);
    }

    #[test]
    fn test_with_overlap() {
        let pieces = chunk_text(SAMPLE, 10, with_overlap(50));
        assert_eq!(pieces.len(), 14);
        // A 50% overlap moves the window by 5 characters, so the second chunk
        // starts with "is a ".
        assert!(pieces[1].starts_with("is a "));
    }

    #[test]
    fn test_max_overlap() {
        // With 90% overlap the step size is 1, so there are a lot of chunks.
        let at_limit = chunk_text(SAMPLE, 10, with_overlap(90)).len();
        assert!(at_limit > 20);

        // Percentages above 90 are capped, so 100% must behave like 90%.
        let above_limit = chunk_text(SAMPLE, 10, with_overlap(100)).len();
        assert_eq!(at_limit, above_limit);
    }
}