chunk_text

Function chunk_text 

Source
pub fn chunk_text(text: &str, language: Option<Language>) -> Result<Vec<Chunk>>
Examples found in repository:
examples/debug_chunks.rs (line 21)
14fn test_file(path: &str, language: Language) {
15    println!("=== Testing {} ===", path);
16    let code = std::fs::read_to_string(path).expect(&format!("Failed to read {}", path));
17
18    println!("File length: {} characters", code.len());
19    println!("Estimated tokens: {}", estimate_tokens(&code));
20
21    let chunks = chunk_text(&code, Some(language)).expect("Failed to chunk code");
22
23    println!("\nGenerated {} chunks:", chunks.len());
24
25    let mut over_limit = 0;
26    for (i, chunk) in chunks.iter().enumerate() {
27        let token_estimate = estimate_tokens(&chunk.text);
28        println!(
29            "Chunk {}: {} chars, ~{} tokens, type: {:?}",
30            i + 1,
31            chunk.text.len(),
32            token_estimate,
33            chunk.chunk_type
34        );
35
36        if token_estimate > 512 {
37            over_limit += 1;
38            println!("  ❌ WARNING: Chunk exceeds 512 token limit!");
39        } else {
40            println!("  ✅ Within 512 token limit");
41        }
42
43        // Show first few lines of the chunk
44        let lines: Vec<&str> = chunk.text.lines().take(3).collect();
45        for line in lines {
46            println!("  {}", line);
47        }
48        if chunk.text.lines().count() > 3 {
49            println!("  ...");
50        }
51        println!();
52    }
53
54    println!(
55        "Summary: {}/{} chunks exceed 512 token limit\n",
56        over_limit,
57        chunks.len()
58    );
59}
60
61fn test_striding() {
62    println!("=== Testing Striding Functionality ===");
63
64    // Configure striding with a small limit to test the mechanism
65    let config = ChunkConfig {
66        max_tokens: 200,    // Very small limit to trigger striding
67        stride_overlap: 50, // 25% overlap
68        enable_striding: true,
69    };
70
71    let code = std::fs::read_to_string("examples/code/large_function.py")
72        .expect("Failed to read large Python file");
73
74    println!(
75        "Original file: {} chars, ~{} tokens",
76        code.len(),
77        estimate_tokens(&code)
78    );
79
80    let chunks = chunk_text_with_config(&code, Some(Language::Python), &config)
81        .expect("Failed to chunk with striding");
82
83    println!("\nWith striding (limit: {} tokens):", config.max_tokens);
84    println!("Generated {} chunks:", chunks.len());
85
86    let mut strided_count = 0;
87    for (i, chunk) in chunks.iter().enumerate() {
88        let token_estimate = estimate_tokens(&chunk.text);
89        let stride_info = if let Some(ref info) = chunk.stride_info {
90            strided_count += 1;
91            format!(
92                " [STRIDE {}/{} from {}]",
93                info.stride_index + 1,
94                info.total_strides,
95                &info.original_chunk_id[..20]
96            )
97        } else {
98            " [ORIGINAL]".to_string()
99        };
100
101        println!(
102            "Chunk {}: {} chars, ~{} tokens, type: {:?}{}",
103            i + 1,
104            chunk.text.len(),
105            token_estimate,
106            chunk.chunk_type,
107            stride_info
108        );
109
110        if token_estimate > config.max_tokens {
111            println!("  ⚠️  Still exceeds limit! ({})", token_estimate);
112        } else {
113            println!("  ✅ Within limit");
114        }
115    }
116
117    println!(
118        "\nSummary: {}/{} chunks are strided",
119        strided_count,
120        chunks.len()
121    );
122}
123
124fn main() {
125    // Skip file tests that require specific paths, focus on striding
126    println!("=== Striding Test ===");
127
128    // Create a large synthetic function to test striding
129    let large_code = r#"
130def very_large_function():
131    """
132    This is a very large function that will definitely exceed token limits.
133    It contains a lot of logic and comments to make it realistically large.
134    """
135    # Initialize variables
136    result = []
137    config = {"timeout": 30, "retries": 3}
138    
139    # First major section - input validation
140    if not data:
141        print("No data provided")
142        return None
143    
144    for i in range(100):
145        if i % 2 == 0:
146            result.append(f"Even number: {i}")
147        else:
148            result.append(f"Odd number: {i}")
149    
150    # Second major section - processing
151    processed_data = []
152    for item in result:
153        processed_item = item.upper()
154        if len(processed_item) > 10:
155            processed_item = processed_item[:10] + "..."
156        processed_data.append(processed_item)
157    
158    # Third major section - more complex logic
159    final_result = {}
160    for idx, item in enumerate(processed_data):
161        key = f"item_{idx}"
162        final_result[key] = {
163            "value": item,
164            "index": idx,
165            "is_even": idx % 2 == 0,
166            "length": len(item)
167        }
168    
169    # Fourth major section - validation and cleanup
170    cleaned_result = {}
171    for key, value in final_result.items():
172        if value["length"] > 5:
173            cleaned_result[key] = value
174    
175    # Fifth major section - return processing
176    if len(cleaned_result) == 0:
177        return {"status": "empty", "count": 0}
178    
179    return {
180        "status": "success", 
181        "count": len(cleaned_result),
182        "data": cleaned_result,
183        "metadata": {
184            "processed_at": "2024-01-01",
185            "version": "1.0",
186            "algorithm": "basic"
187        }
188    }
189"#;
190
191    println!(
192        "Large synthetic function: {} chars, ~{} tokens",
193        large_code.len(),
194        estimate_tokens(large_code)
195    );
196
197    // Test with normal chunking (no striding)
198    println!("\n=== Without Striding ===");
199    let normal_chunks =
200        chunk_text(large_code, Some(Language::Python)).expect("Failed to chunk without striding");
201
202    println!("Generated {} chunks:", normal_chunks.len());
203    for (i, chunk) in normal_chunks.iter().enumerate() {
204        let tokens = estimate_tokens(&chunk.text);
205        println!(
206            "Chunk {}: {} chars, ~{} tokens",
207            i + 1,
208            chunk.text.len(),
209            tokens
210        );
211    }
212
213    // Test with realistic Nomic model limit (8192 tokens)
214    println!("\n=== With Nomic Model Limits (8192 token limit) ===");
215    let config = ChunkConfig {
216        max_tokens: 8192,     // Nomic model's actual limit
217        stride_overlap: 1024, // 12.5% overlap
218        enable_striding: true,
219    };
220
221    let strided_chunks = chunk_text_with_config(large_code, Some(Language::Python), &config)
222        .expect("Failed to chunk with striding");
223
224    println!("Generated {} chunks:", strided_chunks.len());
225    let mut strided_count = 0;
226    for (i, chunk) in strided_chunks.iter().enumerate() {
227        let tokens = estimate_tokens(&chunk.text);
228        let stride_info = if chunk.stride_info.is_some() {
229            strided_count += 1;
230            " [STRIDED]"
231        } else {
232            " [ORIGINAL]"
233        };
234
235        println!(
236            "Chunk {}: {} chars, ~{} tokens{}",
237            i + 1,
238            chunk.text.len(),
239            tokens,
240            stride_info
241        );
242
243        if tokens > config.max_tokens {
244            println!("  ❌ Still exceeds 8192 token limit!");
245        } else {
246            println!("  ✅ Fits in Nomic model context window");
247        }
248    }
249
250    println!(
251        "\nResult: {}/{} chunks are strided",
252        strided_count,
253        strided_chunks.len()
254    );
255}