/// Optional tuning parameters for [`chunk_text`].
///
/// The type is a small plain-data value, so it is `Copy` and comparable in
/// addition to being `Debug` and `Clone`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ChunkOptions {
    /// Share of each chunk, in percent of the chunk size, that is repeated at the
    /// start of the following chunk. `0` means consecutive chunks do not overlap.
    /// `chunk_text` treats any value above 90 as 90.
    pub overlap_percentage: u8,
}
6
7impl Default for ChunkOptions {
8 fn default() -> Self {
9 ChunkOptions {
10 overlap_percentage: 0,
11 }
12 }
13}
14
15pub fn chunk_text(text: &str, chunk_size: usize, options: Option<ChunkOptions>) -> Vec<String> {
44 if text.is_empty() || chunk_size == 0 {
45 return vec![];
46 }
47
48 let mut options = options.unwrap_or_default();
49
50 if options.overlap_percentage > 90 {
52 options.overlap_percentage = 90;
53 }
54
55 let chars: Vec<char> = text.chars().collect();
57 let total_chars = chars.len();
58
59 if total_chars <= chunk_size {
60 return vec![text.to_string()];
61 }
62
63 let mut chunks = Vec::new();
64
65 let overlap_size =
67 ((chunk_size as f64 * options.overlap_percentage as f64 / 100.0) + 0.5) as usize;
68
69 let step_size = if overlap_size >= chunk_size {
71 (chunk_size as f64 * 0.1).ceil() as usize
73 } else {
74 chunk_size - overlap_size
75 };
76
77 let mut start = 0;
78
79 while start < total_chars {
80 let end = std::cmp::min(start + chunk_size, total_chars);
81 let chunk: String = chars[start..end].iter().collect();
82 chunks.push(chunk);
83
84 start += step_size;
86 }
87
88 chunks
89}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// 70-character sample shared by the chunking tests.
    const SAMPLE: &str = "This is a test text. We will split this long text into smaller chunks.";

    /// Builds options requesting the given overlap percentage.
    fn with_overlap(overlap_percentage: u8) -> Option<ChunkOptions> {
        Some(ChunkOptions {
            overlap_percentage,
            ..Default::default()
        })
    }

    #[test]
    fn test_empty_text() {
        assert!(chunk_text("", 10, None).is_empty());
    }

    #[test]
    fn test_zero_chunk_size() {
        // A zero chunk size yields no chunks instead of looping forever.
        assert!(chunk_text("abc", 0, None).is_empty());
    }

    #[test]
    fn test_text_smaller_than_chunk() {
        let text = "Small text";
        assert_eq!(chunk_text(text, 20, None), vec![text]);
    }

    #[test]
    fn test_text_equal_to_chunk_size() {
        // Exactly `chunk_size` characters still counts as a single chunk.
        assert_eq!(chunk_text("0123456789", 10, None), vec!["0123456789"]);
    }

    #[test]
    fn test_no_overlap() {
        let chunks = chunk_text(SAMPLE, 10, None);
        assert_eq!(
            chunks,
            vec![
                "This is a ",
                "test text.",
                " We will s",
                "plit this ",
                "long text ",
                "into small",
                "er chunks.",
            ]
        );
    }

    #[test]
    fn test_with_overlap() {
        let chunks = chunk_text(SAMPLE, 10, with_overlap(50));
        assert_eq!(chunks.len(), 14);
        assert_eq!(chunks[0], "This is a ");
        // Each chunk starts half a chunk (5 characters) after the previous one.
        assert!(chunks[1].starts_with("is a "));
        assert_eq!(chunks[1], "is a test ");
        // The final start position leaves only a partial chunk.
        assert_eq!(chunks[13], "unks.");
    }

    #[test]
    fn test_max_overlap() {
        let chunks = chunk_text(SAMPLE, 10, with_overlap(90));
        assert!(chunks.len() > 20);
        // 90% overlap of a 10-character chunk advances one character at a time.
        assert_eq!(chunks.len(), 70);
        assert_eq!(chunks[1], "his is a t");

        // Requests above the maximum are clamped to 90%.
        let capped_chunks = chunk_text(SAMPLE, 10, with_overlap(100));
        assert_eq!(chunks.len(), capped_chunks.len());
        assert_eq!(chunks, capped_chunks);
    }

    #[test]
    fn test_multibyte_characters() {
        // Chunk sizes are measured in characters, so multi-byte text splits cleanly.
        let chunks = chunk_text("héllo wörld", 4, None);
        assert_eq!(chunks, vec!["héll", "o wö", "rld"]);
    }
}