pub fn chunk_text(text: &str, language: Option<Language>) -> Result<Vec<Chunk>>

Examples found in repository:
examples/debug_chunks.rs (line 22)
15fn test_file(path: &str, language: Language) {
16 println!("=== Testing {} ===", path);
17 let code = std::fs::read_to_string(path).unwrap_or_else(|_| panic!("Failed to read {}", path));
18
19 println!("File length: {} characters", code.len());
20 println!("Estimated tokens: {}", estimate_tokens(&code));
21
22 let chunks = chunk_text(&code, Some(language)).expect("Failed to chunk code");
23
24 println!("\nGenerated {} chunks:", chunks.len());
25
26 let mut over_limit = 0;
27 for (i, chunk) in chunks.iter().enumerate() {
28 let token_estimate = estimate_tokens(&chunk.text);
29 println!(
30 "Chunk {}: {} chars, ~{} tokens, type: {:?}",
31 i + 1,
32 chunk.text.len(),
33 token_estimate,
34 chunk.chunk_type
35 );
36
37 if token_estimate > 512 {
38 over_limit += 1;
39 println!(" ❌ WARNING: Chunk exceeds 512 token limit!");
40 } else {
41 println!(" ✅ Within 512 token limit");
42 }
43
44 // Show first few lines of the chunk
45 let lines: Vec<&str> = chunk.text.lines().take(3).collect();
46 for line in lines {
47 println!(" {}", line);
48 }
49 if chunk.text.lines().count() > 3 {
50 println!(" ...");
51 }
52 println!();
53 }
54
55 println!(
56 "Summary: {}/{} chunks exceed 512 token limit\n",
57 over_limit,
58 chunks.len()
59 );
60}
61
62#[allow(dead_code)]
63fn test_striding() {
64 println!("=== Testing Striding Functionality ===");
65
66 // Configure striding with a small limit to test the mechanism
67 let config = ChunkConfig {
68 max_tokens: 200, // Very small limit to trigger striding
69 stride_overlap: 50, // 25% overlap
70 enable_striding: true,
71 };
72
73 let code = std::fs::read_to_string("examples/code/large_function.py")
74 .expect("Failed to read large Python file");
75
76 println!(
77 "Original file: {} chars, ~{} tokens",
78 code.len(),
79 estimate_tokens(&code)
80 );
81
82 let chunks = chunk_text_with_config(&code, Some(Language::Python), &config)
83 .expect("Failed to chunk with striding");
84
85 println!("\nWith striding (limit: {} tokens):", config.max_tokens);
86 println!("Generated {} chunks:", chunks.len());
87
88 let mut strided_count = 0;
89 for (i, chunk) in chunks.iter().enumerate() {
90 let token_estimate = estimate_tokens(&chunk.text);
91 let stride_info = if let Some(ref info) = chunk.stride_info {
92 strided_count += 1;
93 format!(
94 " [STRIDE {}/{} from {}]",
95 info.stride_index + 1,
96 info.total_strides,
97 &info.original_chunk_id[..20]
98 )
99 } else {
100 " [ORIGINAL]".to_string()
101 };
102
103 println!(
104 "Chunk {}: {} chars, ~{} tokens, type: {:?}{}",
105 i + 1,
106 chunk.text.len(),
107 token_estimate,
108 chunk.chunk_type,
109 stride_info
110 );
111
112 if token_estimate > config.max_tokens {
113 println!(" ⚠️ Still exceeds limit! ({})", token_estimate);
114 } else {
115 println!(" ✅ Within limit");
116 }
117 }
118
119 println!(
120 "\nSummary: {}/{} chunks are strided",
121 strided_count,
122 chunks.len()
123 );
124}
125
126fn main() {
127 // Skip file tests that require specific paths, focus on striding
128 println!("=== Striding Test ===");
129
130 // Create a large synthetic function to test striding
131 let large_code = r#"
132def very_large_function():
133 """
134 This is a very large function that will definitely exceed token limits.
135 It contains a lot of logic and comments to make it realistically large.
136 """
137 # Initialize variables
138 result = []
139 config = {"timeout": 30, "retries": 3}
140
141 # First major section - input validation
142 if not data:
143 print("No data provided")
144 return None
145
146 for i in range(100):
147 if i % 2 == 0:
148 result.append(f"Even number: {i}")
149 else:
150 result.append(f"Odd number: {i}")
151
152 # Second major section - processing
153 processed_data = []
154 for item in result:
155 processed_item = item.upper()
156 if len(processed_item) > 10:
157 processed_item = processed_item[:10] + "..."
158 processed_data.append(processed_item)
159
160 # Third major section - more complex logic
161 final_result = {}
162 for idx, item in enumerate(processed_data):
163 key = f"item_{idx}"
164 final_result[key] = {
165 "value": item,
166 "index": idx,
167 "is_even": idx % 2 == 0,
168 "length": len(item)
169 }
170
171 # Fourth major section - validation and cleanup
172 cleaned_result = {}
173 for key, value in final_result.items():
174 if value["length"] > 5:
175 cleaned_result[key] = value
176
177 # Fifth major section - return processing
178 if len(cleaned_result) == 0:
179 return {"status": "empty", "count": 0}
180
181 return {
182 "status": "success",
183 "count": len(cleaned_result),
184 "data": cleaned_result,
185 "metadata": {
186 "processed_at": "2024-01-01",
187 "version": "1.0",
188 "algorithm": "basic"
189 }
190 }
191"#;
192
193 println!(
194 "Large synthetic function: {} chars, ~{} tokens",
195 large_code.len(),
196 estimate_tokens(large_code)
197 );
198
199 // Test with normal chunking (no striding)
200 println!("\n=== Without Striding ===");
201 let normal_chunks =
202 chunk_text(large_code, Some(Language::Python)).expect("Failed to chunk without striding");
203
204 println!("Generated {} chunks:", normal_chunks.len());
205 for (i, chunk) in normal_chunks.iter().enumerate() {
206 let tokens = estimate_tokens(&chunk.text);
207 println!(
208 "Chunk {}: {} chars, ~{} tokens",
209 i + 1,
210 chunk.text.len(),
211 tokens
212 );
213 }
214
215 // Test with realistic Nomic model limit (8192 tokens)
216 println!("\n=== With Nomic Model Limits (8192 token limit) ===");
217 let config = ChunkConfig {
218 max_tokens: 8192, // Nomic model's actual limit
219 stride_overlap: 1024, // 12.5% overlap
220 enable_striding: true,
221 };
222
223 let strided_chunks = chunk_text_with_config(large_code, Some(Language::Python), &config)
224 .expect("Failed to chunk with striding");
225
226 println!("Generated {} chunks:", strided_chunks.len());
227 let mut strided_count = 0;
228 for (i, chunk) in strided_chunks.iter().enumerate() {
229 let tokens = estimate_tokens(&chunk.text);
230 let stride_info = if chunk.stride_info.is_some() {
231 strided_count += 1;
232 " [STRIDED]"
233 } else {
234 " [ORIGINAL]"
235 };
236
237 println!(
238 "Chunk {}: {} chars, ~{} tokens{}",
239 i + 1,
240 chunk.text.len(),
241 tokens,
242 stride_info
243 );
244
245 if tokens > config.max_tokens {
246 println!(" ❌ Still exceeds 8192 token limit!");
247 } else {
248 println!(" ✅ Fits in Nomic model context window");
249 }
250 }
251
252 println!(
253 "\nResult: {}/{} chunks are strided",
254 strided_count,
255 strided_chunks.len()
256 );
257}