chunk_text

Function `chunk_text`

Source
pub fn chunk_text(text: &str, language: Option<Language>) -> Result<Vec<Chunk>>
Examples found in repository:
examples/debug_chunks.rs (line 22)
15fn test_file(path: &str, language: Language) {
16    println!("=== Testing {} ===", path);
17    let code = std::fs::read_to_string(path).unwrap_or_else(|_| panic!("Failed to read {}", path));
18
19    println!("File length: {} characters", code.len());
20    println!("Estimated tokens: {}", estimate_tokens(&code));
21
22    let chunks = chunk_text(&code, Some(language)).expect("Failed to chunk code");
23
24    println!("\nGenerated {} chunks:", chunks.len());
25
26    let mut over_limit = 0;
27    for (i, chunk) in chunks.iter().enumerate() {
28        let token_estimate = estimate_tokens(&chunk.text);
29        println!(
30            "Chunk {}: {} chars, ~{} tokens, type: {:?}",
31            i + 1,
32            chunk.text.len(),
33            token_estimate,
34            chunk.chunk_type
35        );
36
37        if token_estimate > 512 {
38            over_limit += 1;
39            println!("  ❌ WARNING: Chunk exceeds 512 token limit!");
40        } else {
41            println!("  ✅ Within 512 token limit");
42        }
43
44        // Show first few lines of the chunk
45        let lines: Vec<&str> = chunk.text.lines().take(3).collect();
46        for line in lines {
47            println!("  {}", line);
48        }
49        if chunk.text.lines().count() > 3 {
50            println!("  ...");
51        }
52        println!();
53    }
54
55    println!(
56        "Summary: {}/{} chunks exceed 512 token limit\n",
57        over_limit,
58        chunks.len()
59    );
60}
61
62#[allow(dead_code)]
63fn test_striding() {
64    println!("=== Testing Striding Functionality ===");
65
66    // Configure striding with a small limit to test the mechanism
67    let config = ChunkConfig {
68        max_tokens: 200,    // Very small limit to trigger striding
69        stride_overlap: 50, // 25% overlap
70        enable_striding: true,
71    };
72
73    let code = std::fs::read_to_string("examples/code/large_function.py")
74        .expect("Failed to read large Python file");
75
76    println!(
77        "Original file: {} chars, ~{} tokens",
78        code.len(),
79        estimate_tokens(&code)
80    );
81
82    let chunks = chunk_text_with_config(&code, Some(Language::Python), &config)
83        .expect("Failed to chunk with striding");
84
85    println!("\nWith striding (limit: {} tokens):", config.max_tokens);
86    println!("Generated {} chunks:", chunks.len());
87
88    let mut strided_count = 0;
89    for (i, chunk) in chunks.iter().enumerate() {
90        let token_estimate = estimate_tokens(&chunk.text);
91        let stride_info = if let Some(ref info) = chunk.stride_info {
92            strided_count += 1;
93            format!(
94                " [STRIDE {}/{} from {}]",
95                info.stride_index + 1,
96                info.total_strides,
97                &info.original_chunk_id[..20]
98            )
99        } else {
100            " [ORIGINAL]".to_string()
101        };
102
103        println!(
104            "Chunk {}: {} chars, ~{} tokens, type: {:?}{}",
105            i + 1,
106            chunk.text.len(),
107            token_estimate,
108            chunk.chunk_type,
109            stride_info
110        );
111
112        if token_estimate > config.max_tokens {
113            println!("  ⚠️  Still exceeds limit! ({})", token_estimate);
114        } else {
115            println!("  ✅ Within limit");
116        }
117    }
118
119    println!(
120        "\nSummary: {}/{} chunks are strided",
121        strided_count,
122        chunks.len()
123    );
124}
125
126fn main() {
127    // Skip file tests that require specific paths, focus on striding
128    println!("=== Striding Test ===");
129
130    // Create a large synthetic function to test striding
131    let large_code = r#"
132def very_large_function():
133    """
134    This is a very large function that will definitely exceed token limits.
135    It contains a lot of logic and comments to make it realistically large.
136    """
137    # Initialize variables
138    result = []
139    config = {"timeout": 30, "retries": 3}
140    
141    # First major section - input validation
142    if not data:
143        print("No data provided")
144        return None
145    
146    for i in range(100):
147        if i % 2 == 0:
148            result.append(f"Even number: {i}")
149        else:
150            result.append(f"Odd number: {i}")
151    
152    # Second major section - processing
153    processed_data = []
154    for item in result:
155        processed_item = item.upper()
156        if len(processed_item) > 10:
157            processed_item = processed_item[:10] + "..."
158        processed_data.append(processed_item)
159    
160    # Third major section - more complex logic
161    final_result = {}
162    for idx, item in enumerate(processed_data):
163        key = f"item_{idx}"
164        final_result[key] = {
165            "value": item,
166            "index": idx,
167            "is_even": idx % 2 == 0,
168            "length": len(item)
169        }
170    
171    # Fourth major section - validation and cleanup
172    cleaned_result = {}
173    for key, value in final_result.items():
174        if value["length"] > 5:
175            cleaned_result[key] = value
176    
177    # Fifth major section - return processing
178    if len(cleaned_result) == 0:
179        return {"status": "empty", "count": 0}
180    
181    return {
182        "status": "success", 
183        "count": len(cleaned_result),
184        "data": cleaned_result,
185        "metadata": {
186            "processed_at": "2024-01-01",
187            "version": "1.0",
188            "algorithm": "basic"
189        }
190    }
191"#;
192
193    println!(
194        "Large synthetic function: {} chars, ~{} tokens",
195        large_code.len(),
196        estimate_tokens(large_code)
197    );
198
199    // Test with normal chunking (no striding)
200    println!("\n=== Without Striding ===");
201    let normal_chunks =
202        chunk_text(large_code, Some(Language::Python)).expect("Failed to chunk without striding");
203
204    println!("Generated {} chunks:", normal_chunks.len());
205    for (i, chunk) in normal_chunks.iter().enumerate() {
206        let tokens = estimate_tokens(&chunk.text);
207        println!(
208            "Chunk {}: {} chars, ~{} tokens",
209            i + 1,
210            chunk.text.len(),
211            tokens
212        );
213    }
214
215    // Test with realistic Nomic model limit (8192 tokens)
216    println!("\n=== With Nomic Model Limits (8192 token limit) ===");
217    let config = ChunkConfig {
218        max_tokens: 8192,     // Nomic model's actual limit
219        stride_overlap: 1024, // 12.5% overlap
220        enable_striding: true,
221    };
222
223    let strided_chunks = chunk_text_with_config(large_code, Some(Language::Python), &config)
224        .expect("Failed to chunk with striding");
225
226    println!("Generated {} chunks:", strided_chunks.len());
227    let mut strided_count = 0;
228    for (i, chunk) in strided_chunks.iter().enumerate() {
229        let tokens = estimate_tokens(&chunk.text);
230        let stride_info = if chunk.stride_info.is_some() {
231            strided_count += 1;
232            " [STRIDED]"
233        } else {
234            " [ORIGINAL]"
235        };
236
237        println!(
238            "Chunk {}: {} chars, ~{} tokens{}",
239            i + 1,
240            chunk.text.len(),
241            tokens,
242            stride_info
243        );
244
245        if tokens > config.max_tokens {
246            println!("  ❌ Still exceeds 8192 token limit!");
247        } else {
248            println!("  ✅ Fits in Nomic model context window");
249        }
250    }
251
252    println!(
253        "\nResult: {}/{} chunks are strided",
254        strided_count,
255        strided_chunks.len()
256    );
257}