1use crate::error::Result;
2
/// One contiguous piece of a larger text, together with its location in the source.
///
/// `PartialEq`/`Eq` are derived so chunks can be compared directly (for example with
/// `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContentChunk {
    /// Text of the chunk.
    pub content: String,
    /// Byte offset in the source text where this chunk begins (inclusive).
    pub start_byte: usize,
    /// Byte offset in the source text where this chunk ends (exclusive).
    pub end_byte: usize,
    /// Zero-based position of this chunk in the sequence produced for one input.
    pub chunk_index: usize,
}
11
/// Splits text into byte-bounded chunks, where consecutive chunks may share some bytes.
///
/// `Debug` and `Clone` are derived so a configured chunker can be logged, copied, and
/// embedded in other types that derive those traits.
#[derive(Debug, Clone)]
pub struct FileChunker {
    /// Upper bound, in bytes, on the length of each chunk.
    chunk_size: usize,
    /// Number of trailing bytes of a chunk that are repeated at the start of the next one.
    overlap_size: usize,
}
17
18impl FileChunker {
19 pub fn new(chunk_size: usize, overlap_size: usize) -> Self {
21 Self {
22 chunk_size,
23 overlap_size,
24 }
25 }
26
27 pub fn with_defaults() -> Self {
29 Self::new(8192, 200)
30 }
31
32 pub fn chunk_content(&self, content: &str) -> Result<Vec<ContentChunk>> {
34 let mut chunks = Vec::new();
35 let bytes = content.as_bytes();
36 let mut start = 0;
37 let mut chunk_index = 0;
38
39 while start < bytes.len() {
40 let end = (start + self.chunk_size).min(bytes.len());
42
43 let safe_end = if end < bytes.len() {
45 let mut boundary = end;
47 while boundary > start && !content.is_char_boundary(boundary) {
48 boundary -= 1;
49 }
50 boundary
51 } else {
52 end
53 };
54
55 let chunk_content = String::from_utf8_lossy(&bytes[start..safe_end]).to_string();
57
58 chunks.push(ContentChunk {
59 content: chunk_content,
60 start_byte: start,
61 end_byte: safe_end,
62 chunk_index,
63 });
64
65 if safe_end >= bytes.len() {
67 break;
68 }
69
70 let overlap_start = if safe_end > self.overlap_size {
72 safe_end - self.overlap_size
73 } else {
74 safe_end
75 };
76
77 let mut safe_overlap_start = overlap_start;
79 while safe_overlap_start < safe_end && !content.is_char_boundary(safe_overlap_start) {
80 safe_overlap_start += 1;
81 }
82
83 start = safe_overlap_start;
84 chunk_index += 1;
85 }
86
87 Ok(chunks)
88 }
89}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `text` is exactly `source[start..end]`.
    ///
    /// `str::get` returns `None` for offsets that are out of range or not on a character
    /// boundary, so a bad offset fails the assertion instead of panicking inside slicing.
    fn assert_is_slice_of(source: &str, start: usize, end: usize, text: &str) {
        assert_eq!(
            source.get(start..end),
            Some(text),
            "chunk {start}..{end} does not match the source text"
        );
    }

    /// Plain ASCII input is split into the expected number of consecutively indexed chunks
    /// that together span the whole input.
    #[test]
    fn test_basic_chunking() {
        let chunker = FileChunker::new(100, 20);
        let content = "a".repeat(250);
        let chunks = chunker.chunk_content(&content).unwrap();

        assert_eq!(chunks.len(), 3);
        for (expected_index, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, expected_index);
            assert_is_slice_of(&content, chunk.start_byte, chunk.end_byte, &chunk.content);
        }
        assert_eq!(chunks[0].start_byte, 0);
        assert_eq!(chunks[2].end_byte, content.len());
    }

    /// Multi-byte characters are never split: every chunk is a valid slice of the source and
    /// contains no replacement characters.
    #[test]
    fn test_utf8_boundary_safety() {
        let chunker = FileChunker::new(10, 2);
        let content = "Hello 世界 World";
        let chunks = chunker.chunk_content(content).unwrap();

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_is_slice_of(content, chunk.start_byte, chunk.end_byte, &chunk.content);
            assert!(!chunk.content.contains('\u{FFFD}'));
        }

        // The chunks must cover the input from the first byte to the last without gaps.
        assert_eq!(chunks[0].start_byte, 0);
        assert_eq!(chunks[chunks.len() - 1].end_byte, content.len());
        for pair in chunks.windows(2) {
            assert!(pair[1].start_byte <= pair[0].end_byte);
        }
    }

    /// Consecutive chunks share exactly the configured number of bytes for ASCII input.
    #[test]
    fn test_overlap() {
        let chunker = FileChunker::new(50, 10);
        let content = "a".repeat(100);
        let chunks = chunker.chunk_content(&content).unwrap();

        // Asserted unconditionally: a single-chunk result would mean chunking is broken and
        // must not let this test pass vacuously.
        assert!(chunks.len() > 1);
        for pair in chunks.windows(2) {
            assert!(pair[1].start_byte < pair[0].end_byte);
            assert_eq!(pair[1].start_byte, pair[0].end_byte - 10);
        }
    }
}