pub fn chunk_text(text: &str, language: Option<Language>) -> Result<Vec<Chunk>>

Examples found in repository:
examples/debug_chunks.rs (line 21)
14fn test_file(path: &str, language: Language) {
15 println!("=== Testing {} ===", path);
16 let code = std::fs::read_to_string(path).expect(&format!("Failed to read {}", path));
17
18 println!("File length: {} characters", code.len());
19 println!("Estimated tokens: {}", estimate_tokens(&code));
20
21 let chunks = chunk_text(&code, Some(language)).expect("Failed to chunk code");
22
23 println!("\nGenerated {} chunks:", chunks.len());
24
25 let mut over_limit = 0;
26 for (i, chunk) in chunks.iter().enumerate() {
27 let token_estimate = estimate_tokens(&chunk.text);
28 println!(
29 "Chunk {}: {} chars, ~{} tokens, type: {:?}",
30 i + 1,
31 chunk.text.len(),
32 token_estimate,
33 chunk.chunk_type
34 );
35
36 if token_estimate > 512 {
37 over_limit += 1;
38 println!(" ❌ WARNING: Chunk exceeds 512 token limit!");
39 } else {
40 println!(" ✅ Within 512 token limit");
41 }
42
43 // Show first few lines of the chunk
44 let lines: Vec<&str> = chunk.text.lines().take(3).collect();
45 for line in lines {
46 println!(" {}", line);
47 }
48 if chunk.text.lines().count() > 3 {
49 println!(" ...");
50 }
51 println!();
52 }
53
54 println!(
55 "Summary: {}/{} chunks exceed 512 token limit\n",
56 over_limit,
57 chunks.len()
58 );
59}
60
61fn test_striding() {
62 println!("=== Testing Striding Functionality ===");
63
64 // Configure striding with a small limit to test the mechanism
65 let config = ChunkConfig {
66 max_tokens: 200, // Very small limit to trigger striding
67 stride_overlap: 50, // 25% overlap
68 enable_striding: true,
69 };
70
71 let code = std::fs::read_to_string("examples/code/large_function.py")
72 .expect("Failed to read large Python file");
73
74 println!(
75 "Original file: {} chars, ~{} tokens",
76 code.len(),
77 estimate_tokens(&code)
78 );
79
80 let chunks = chunk_text_with_config(&code, Some(Language::Python), &config)
81 .expect("Failed to chunk with striding");
82
83 println!("\nWith striding (limit: {} tokens):", config.max_tokens);
84 println!("Generated {} chunks:", chunks.len());
85
86 let mut strided_count = 0;
87 for (i, chunk) in chunks.iter().enumerate() {
88 let token_estimate = estimate_tokens(&chunk.text);
89 let stride_info = if let Some(ref info) = chunk.stride_info {
90 strided_count += 1;
91 format!(
92 " [STRIDE {}/{} from {}]",
93 info.stride_index + 1,
94 info.total_strides,
95 &info.original_chunk_id[..20]
96 )
97 } else {
98 " [ORIGINAL]".to_string()
99 };
100
101 println!(
102 "Chunk {}: {} chars, ~{} tokens, type: {:?}{}",
103 i + 1,
104 chunk.text.len(),
105 token_estimate,
106 chunk.chunk_type,
107 stride_info
108 );
109
110 if token_estimate > config.max_tokens {
111 println!(" ⚠️ Still exceeds limit! ({})", token_estimate);
112 } else {
113 println!(" ✅ Within limit");
114 }
115 }
116
117 println!(
118 "\nSummary: {}/{} chunks are strided",
119 strided_count,
120 chunks.len()
121 );
122}
123
124fn main() {
125 // Skip file tests that require specific paths, focus on striding
126 println!("=== Striding Test ===");
127
128 // Create a large synthetic function to test striding
129 let large_code = r#"
130def very_large_function():
131 """
132 This is a very large function that will definitely exceed token limits.
133 It contains a lot of logic and comments to make it realistically large.
134 """
135 # Initialize variables
136 result = []
137 config = {"timeout": 30, "retries": 3}
138
139 # First major section - input validation
140 if not data:
141 print("No data provided")
142 return None
143
144 for i in range(100):
145 if i % 2 == 0:
146 result.append(f"Even number: {i}")
147 else:
148 result.append(f"Odd number: {i}")
149
150 # Second major section - processing
151 processed_data = []
152 for item in result:
153 processed_item = item.upper()
154 if len(processed_item) > 10:
155 processed_item = processed_item[:10] + "..."
156 processed_data.append(processed_item)
157
158 # Third major section - more complex logic
159 final_result = {}
160 for idx, item in enumerate(processed_data):
161 key = f"item_{idx}"
162 final_result[key] = {
163 "value": item,
164 "index": idx,
165 "is_even": idx % 2 == 0,
166 "length": len(item)
167 }
168
169 # Fourth major section - validation and cleanup
170 cleaned_result = {}
171 for key, value in final_result.items():
172 if value["length"] > 5:
173 cleaned_result[key] = value
174
175 # Fifth major section - return processing
176 if len(cleaned_result) == 0:
177 return {"status": "empty", "count": 0}
178
179 return {
180 "status": "success",
181 "count": len(cleaned_result),
182 "data": cleaned_result,
183 "metadata": {
184 "processed_at": "2024-01-01",
185 "version": "1.0",
186 "algorithm": "basic"
187 }
188 }
189"#;
190
191 println!(
192 "Large synthetic function: {} chars, ~{} tokens",
193 large_code.len(),
194 estimate_tokens(large_code)
195 );
196
197 // Test with normal chunking (no striding)
198 println!("\n=== Without Striding ===");
199 let normal_chunks =
200 chunk_text(large_code, Some(Language::Python)).expect("Failed to chunk without striding");
201
202 println!("Generated {} chunks:", normal_chunks.len());
203 for (i, chunk) in normal_chunks.iter().enumerate() {
204 let tokens = estimate_tokens(&chunk.text);
205 println!(
206 "Chunk {}: {} chars, ~{} tokens",
207 i + 1,
208 chunk.text.len(),
209 tokens
210 );
211 }
212
213 // Test with realistic Nomic model limit (8192 tokens)
214 println!("\n=== With Nomic Model Limits (8192 token limit) ===");
215 let config = ChunkConfig {
216 max_tokens: 8192, // Nomic model's actual limit
217 stride_overlap: 1024, // 12.5% overlap
218 enable_striding: true,
219 };
220
221 let strided_chunks = chunk_text_with_config(large_code, Some(Language::Python), &config)
222 .expect("Failed to chunk with striding");
223
224 println!("Generated {} chunks:", strided_chunks.len());
225 let mut strided_count = 0;
226 for (i, chunk) in strided_chunks.iter().enumerate() {
227 let tokens = estimate_tokens(&chunk.text);
228 let stride_info = if chunk.stride_info.is_some() {
229 strided_count += 1;
230 " [STRIDED]"
231 } else {
232 " [ORIGINAL]"
233 };
234
235 println!(
236 "Chunk {}: {} chars, ~{} tokens{}",
237 i + 1,
238 chunk.text.len(),
239 tokens,
240 stride_info
241 );
242
243 if tokens > config.max_tokens {
244 println!(" ❌ Still exceeds 8192 token limit!");
245 } else {
246 println!(" ✅ Fits in Nomic model context window");
247 }
248 }
249
250 println!(
251 "\nResult: {}/{} chunks are strided",
252 strided_count,
253 strided_chunks.len()
254 );
255}