1use crate::error::Result;
2
/// A contiguous piece of a larger text, together with its position in that text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContentChunk {
    /// The chunk's text, i.e. the source sliced at `start_byte..end_byte`.
    pub content: String,
    /// Byte offset in the source at which the chunk begins (inclusive).
    pub start_byte: usize,
    /// Byte offset in the source at which the chunk ends (exclusive).
    pub end_byte: usize,
    /// Zero-based position of the chunk in the sequence returned by the chunker.
    pub chunk_index: usize,
}

/// Splits text into size-limited, optionally overlapping chunks that never cut
/// a UTF-8 character in half.
#[derive(Debug, Clone)]
pub struct FileChunker {
    /// Upper bound, in bytes, that a chunk aims for.
    chunk_size: usize,
    /// Number of trailing bytes of a chunk that the following chunk repeats.
    overlap_size: usize,
}

impl FileChunker {
    /// Creates a chunker producing chunks of at most `chunk_size` bytes, each
    /// repeating up to `overlap_size` bytes from the end of its predecessor.
    ///
    /// Both values are stored as given. Degenerate combinations (a zero
    /// `chunk_size`, or an `overlap_size` that is not smaller than
    /// `chunk_size`) are tolerated by [`FileChunker::chunk_content`].
    pub fn new(chunk_size: usize, overlap_size: usize) -> Self {
        Self {
            chunk_size,
            overlap_size,
        }
    }

    /// Creates a chunker with 8192-byte chunks and a 200-byte overlap.
    pub fn with_defaults() -> Self {
        Self::new(8192, 200)
    }

    /// Splits `content` into consecutive chunks.
    ///
    /// Guarantees:
    /// * every chunk starts and ends on a UTF-8 character boundary;
    /// * every chunk is non-empty, and each chunk starts strictly after the
    ///   previous one started, so the call always terminates;
    /// * a chunk is at most `chunk_size` bytes long, except when the first
    ///   character at its start is itself longer than `chunk_size` (including
    ///   `chunk_size == 0`), in which case the chunk is exactly that character;
    /// * the next chunk starts `overlap_size` bytes before the end of the
    ///   current one (rounded forward to a character boundary). When rewinding
    ///   that far would not move past the current chunk's start, the overlap is
    ///   dropped for that step and the next chunk starts where this one ended.
    ///
    /// Empty input yields an empty vector.
    ///
    /// # Errors
    ///
    /// This implementation never returns `Err`; the `Result` return type is
    /// kept so existing callers continue to compile unchanged.
    pub fn chunk_content(&self, content: &str) -> Result<Vec<ContentChunk>> {
        let total = content.len();
        let mut chunks = Vec::new();
        let mut start = 0;

        while start < total {
            let end = self.chunk_end(content, start);
            let chunk_index = chunks.len();

            chunks.push(ContentChunk {
                // `start` and `end` are character boundaries by construction,
                // so this slice cannot panic.
                content: content[start..end].to_owned(),
                start_byte: start,
                end_byte: end,
                chunk_index,
            });

            if end >= total {
                break;
            }
            start = self.next_chunk_start(content, start, end);
        }

        Ok(chunks)
    }

    /// Returns the exclusive end offset of the chunk beginning at `start`.
    ///
    /// Requires `start < content.len()` and `start` on a character boundary.
    /// The result is always a character boundary strictly greater than `start`.
    fn chunk_end(&self, content: &str, start: usize) -> usize {
        // Saturate so an enormous `chunk_size` cannot overflow the addition.
        let mut end = start.saturating_add(self.chunk_size).min(content.len());

        // Back off to the closest character boundary at or before the limit.
        while end > start && !content.is_char_boundary(end) {
            end -= 1;
        }

        if end == start {
            // The limit is zero or falls inside the first character: take that
            // whole character anyway, otherwise the chunk would be empty and
            // the caller could never advance.
            end = start + 1;
            while !content.is_char_boundary(end) {
                end += 1;
            }
        }

        end
    }

    /// Returns the start offset of the chunk that follows `start..end`.
    ///
    /// The result is a character boundary with `start < result <= end`.
    fn next_chunk_start(&self, content: &str, start: usize, end: usize) -> usize {
        let mut next = end.saturating_sub(self.overlap_size);

        if next <= start {
            // The overlap would swallow the whole chunk; rewinding would stall
            // or move backwards, so continue without overlap instead.
            return end;
        }

        // Round forward so the overlap never begins mid-character. `end` is a
        // boundary, which bounds this loop.
        while !content.is_char_boundary(next) {
            next += 1;
        }

        next
    }
}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII input: the chunk count, indices and outer bounds are fully determined.
    #[test]
    fn test_basic_chunking() {
        let chunker = FileChunker::new(100, 20);
        let content = "a".repeat(250);
        let chunks = chunker.chunk_content(&content).unwrap();

        // 100-byte windows advancing by 80 bytes cover 250 bytes in exactly three chunks.
        assert_eq!(chunks.len(), 3);
        for (expected_index, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, expected_index);
        }
        assert_eq!(chunks[0].start_byte, 0);
        assert_eq!(chunks[2].end_byte, content.len());
    }

    /// Multi-byte characters must never be split between two chunks.
    #[test]
    fn test_utf8_boundary_safety() {
        let chunker = FileChunker::new(10, 2);
        let content = "Hello 世界 World";
        let chunks = chunker.chunk_content(content).unwrap();

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            // `str::get` yields `None` when either offset is not a character
            // boundary, so this checks alignment and content in one step.
            assert_eq!(
                content.get(chunk.start_byte..chunk.end_byte),
                Some(chunk.content.as_str())
            );
        }
        assert_eq!(chunks[chunks.len() - 1].end_byte, content.len());
    }

    /// Consecutive chunks share the configured number of bytes.
    #[test]
    fn test_overlap() {
        let chunker = FileChunker::new(50, 10);
        let content = "a".repeat(100);
        let chunks = chunker.chunk_content(&content).unwrap();

        // Assert instead of guarding with `if`, so the test cannot pass vacuously.
        assert!(chunks.len() > 1);
        let overlap_start = chunks[1].start_byte;
        let first_end = chunks[0].end_byte;
        assert!(overlap_start < first_end);
        assert_eq!(first_end - overlap_start, 10);
    }
}