aprender-orchestrate 0.30.0

Sovereign AI orchestration: autonomous agents, ML serving, code analysis, and transpilation pipelines
//! Semantic Chunker for Code-Aware Text Splitting
//!
//! Implements recursive character splitting with code-aware boundaries.
//! Based on LangChain text splitter patterns [1] and Chen et al. (2017) [21].
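//!
//! # Example
//!
//! A minimal end-to-end sketch; marked `ignore` because the exact import path
//! and the `source_text` input are assumed rather than shown here.
//!
//! ```ignore
//! let chunker = SemanticChunker::default(); // 512-char chunks, 64-char overlap
//! for chunk in chunker.split(source_text) {
//!     println!("lines {}-{}: {} bytes", chunk.start_line, chunk.end_line, chunk.content.len());
//! }
//! ```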

use super::fingerprint::ChunkerConfig;

/// Semantic chunker with code-aware splitting
///
/// Uses recursive character splitting with Rust/Markdown-aware separators.
/// Preserves context through configurable overlap.
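///
/// # Examples
///
/// A construction sketch: separators are tried in order, so earlier entries take
/// priority. Marked `ignore` because the import path and the `markdown_source`
/// input are assumed.
///
/// ```ignore
/// // Prefer Markdown H2 boundaries, then blank lines, then single spaces.
/// let chunker = SemanticChunker::new(
///     256,
///     32,
///     vec!["\n## ".to_string(), "\n\n".to_string(), " ".to_string()],
/// );
/// let chunks = chunker.split(markdown_source);
/// ```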
#[derive(Debug, Clone)]
pub struct SemanticChunker {
    /// Target chunk size in characters (approximates tokens)
    chunk_size: usize,
    /// Overlap between chunks for context preservation
    chunk_overlap: usize,
    /// Code-aware separators ordered by priority (highest to lowest)
    separators: Vec<String>,
}

impl SemanticChunker {
    /// Create a new semantic chunker with custom settings
    pub fn new(chunk_size: usize, chunk_overlap: usize, separators: Vec<String>) -> Self {
        Self { chunk_size, chunk_overlap, separators }
    }

    /// Create from a ChunkerConfig
    pub fn from_config(config: &ChunkerConfig) -> Self {
        Self {
            chunk_size: config.chunk_size,
            chunk_overlap: config.chunk_overlap,
            separators: Self::default_separators(),
        }
    }

    /// Default separators for Rust/Markdown content
    fn default_separators() -> Vec<String> {
        vec![
            "\n## ".to_string(),     // Markdown H2
            "\n### ".to_string(),    // Markdown H3
            "\n#### ".to_string(),   // Markdown H4
            "\nfn ".to_string(),     // Rust function
            "\npub fn ".to_string(), // Rust public function
            "\nimpl ".to_string(),   // Rust impl block
            "\nstruct ".to_string(), // Rust struct
            "\nenum ".to_string(),   // Rust enum
            "\nmod ".to_string(),    // Rust module
            "\n```".to_string(),     // Code fence
            "\n\n".to_string(),      // Paragraph
            "\n".to_string(),        // Line
            " ".to_string(),         // Word
        ]
    }

    /// Split text into chunks
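    ///
    /// Each returned [`Chunk`] carries byte offsets and 1-indexed line numbers into the
    /// source text. A minimal sketch (marked `ignore`; the import path is assumed):
    ///
    /// ```ignore
    /// let chunker = SemanticChunker::new(32, 4, vec!["\n".to_string()]);
    /// let chunks = chunker.split("line one\nline two\nline three\nline four\n");
    /// assert_eq!(chunks[0].start_line, 1);
    /// assert!(chunks.iter().all(|c| c.start_offset <= c.end_offset));
    /// ```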
    pub fn split(&self, text: &str) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        let mut current_pos = 0;

        while current_pos < text.len() {
            let (chunk_text, end_pos) = self.extract_chunk(text, current_pos);

            if !chunk_text.trim().is_empty() {
                let start_line = text[..current_pos].matches('\n').count() + 1;
                let end_line = start_line + chunk_text.matches('\n').count();

                chunks.push(Chunk {
                    content: chunk_text.to_string(),
                    start_offset: current_pos,
                    end_offset: end_pos,
                    start_line,
                    end_line,
                });
            }

            // Move forward, accounting for overlap
            let advance = if end_pos - current_pos > self.chunk_overlap {
                end_pos - current_pos - self.chunk_overlap
            } else {
                end_pos - current_pos
            };

            // Advance at least 1, ensuring we land on a char boundary
            let new_pos = current_pos + advance.max(1);
            current_pos = Self::find_next_char_boundary(text, new_pos);
        }

        chunks
    }

    /// Extract a single chunk starting at position
    fn extract_chunk(&self, text: &str, start: usize) -> (String, usize) {
        let remaining = &text[start..];
        let target_end = Self::find_char_boundary(text, (start + self.chunk_size).min(text.len()));

        // If remaining text fits in one chunk, return it all
        if start + remaining.len() <= target_end {
            return (remaining.to_string(), text.len());
        }

        // Find the best split point using separators
        let search_region = &text[start..target_end];

        for separator in &self.separators {
            if let Some(pos) = search_region.rfind(separator.as_str()) {
                if pos > 0 {
                    // Include the separator in the chunk
                    let end = start + pos + separator.len();
                    return (text[start..end].to_string(), end);
                }
            }
        }

        // No separator found, hard cut at nearest char boundary
        (text[start..target_end].to_string(), target_end)
    }

    /// Find the nearest valid UTF-8 character boundary at or before the given position
    fn find_char_boundary(text: &str, pos: usize) -> usize {
        if pos >= text.len() {
            return text.len();
        }
        // Walk backwards to find a char boundary
        let mut boundary = pos;
        while boundary > 0 && !text.is_char_boundary(boundary) {
            boundary -= 1;
        }
        boundary
    }

    /// Find the next valid UTF-8 character boundary at or after the given position
    fn find_next_char_boundary(text: &str, pos: usize) -> usize {
        if pos >= text.len() {
            return text.len();
        }
        // Walk forwards to find a char boundary
        let mut boundary = pos;
        while boundary < text.len() && !text.is_char_boundary(boundary) {
            boundary += 1;
        }
        boundary
    }

    /// Get configuration as ChunkerConfig
    pub fn config(&self) -> ChunkerConfig {
        let sep_refs: Vec<&str> = self.separators.iter().map(|s| s.as_str()).collect();
        ChunkerConfig::new(self.chunk_size, self.chunk_overlap, &sep_refs)
    }
}

impl Default for SemanticChunker {
    fn default() -> Self {
        Self { chunk_size: 512, chunk_overlap: 64, separators: Self::default_separators() }
    }
}

/// A text chunk with position metadata
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Chunk content
    pub content: String,
    /// Start byte offset in source
    pub start_offset: usize,
    /// End byte offset in source
    pub end_offset: usize,
    /// Start line number (1-indexed)
    pub start_line: usize,
    /// End line number (1-indexed)
    pub end_line: usize,
}

impl Chunk {
    /// Get content hash for deduplication
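    ///
    /// # Examples
    ///
    /// A deduplication sketch: identical content yields identical hashes, so a
    /// `HashSet` of hashes filters repeats. Marked `ignore`; `chunks` is assumed to
    /// come from [`SemanticChunker::split`].
    ///
    /// ```ignore
    /// use std::collections::HashSet;
    ///
    /// let mut seen: HashSet<[u8; 32]> = HashSet::new();
    /// let unique: Vec<&Chunk> = chunks.iter().filter(|c| seen.insert(c.content_hash())).collect();
    /// ```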
    pub fn content_hash(&self) -> [u8; 32] {
        // Same scheme as the fingerprint module: 64-bit FNV-1a over the content bytes,
        // expanded to 32 bytes below.
        let mut hash = [0u8; 32];
        let mut state: u64 = 0xcbf2_9ce4_8422_2325;
        for &byte in self.content.as_bytes() {
            state ^= byte as u64;
            state = state.wrapping_mul(0x0100_0000_01b3);
        }
        for i in 0..4 {
            let chunk = state.wrapping_add(i as u64).to_le_bytes();
            hash[i * 8..(i + 1) * 8].copy_from_slice(&chunk);
        }
        hash
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_chunker_default() {
        let chunker = SemanticChunker::default();
        assert_eq!(chunker.chunk_size, 512);
        assert_eq!(chunker.chunk_overlap, 64);
        assert!(!chunker.separators.is_empty());
    }

    #[test]
    fn test_split_short_text() {
        let chunker = SemanticChunker::default();
        let text = "Short text";
        let chunks = chunker.split(text);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Short text");
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 1);
    }

    #[test]
    fn test_split_markdown_headers() {
        let chunker = SemanticChunker::new(100, 10, vec!["\n## ".to_string()]);
        let text = "# Title\n\nIntro paragraph.\n\n## Section 1\n\nContent 1.\n\n## Section 2\n\nContent 2.";

        let chunks = chunker.split(text);

        // Should split at ## headers
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_split_rust_code() {
        let chunker = SemanticChunker::new(200, 20, vec!["\nfn ".to_string(), "\n\n".to_string()]);
        let text = r#"
fn foo() {
    println!("foo");
}

fn bar() {
    println!("bar");
}

fn baz() {
    println!("baz");
}
"#;

        let chunks = chunker.split(text);

        // Should preserve function boundaries
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            // Each chunk should be valid
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_chunk_overlap() {
        let chunker = SemanticChunker::new(50, 10, vec![" ".to_string()]);
        let text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word12";

        let chunks = chunker.split(text);

        // With overlap, subsequent chunks should share some content
        if chunks.len() > 1 {
            // Check that chunks have reasonable sizes
            for chunk in &chunks {
                assert!(chunk.content.len() <= chunker.chunk_size + 20); // Some tolerance
            }
        }
    }

    #[test]
    fn test_chunk_line_tracking() {
        let chunker = SemanticChunker::new(50, 5, vec!["\n".to_string()]);
        let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";

        let chunks = chunker.split(text);

        // First chunk should start at line 1
        assert_eq!(chunks[0].start_line, 1);
    }

    #[test]
    fn test_chunk_content_hash_deterministic() {
        let chunk1 = Chunk {
            content: "test content".to_string(),
            start_offset: 0,
            end_offset: 12,
            start_line: 1,
            end_line: 1,
        };
        let chunk2 = Chunk {
            content: "test content".to_string(),
            start_offset: 100, // Different offset, same content
            end_offset: 112,
            start_line: 5,
            end_line: 5,
        };

        assert_eq!(chunk1.content_hash(), chunk2.content_hash());
    }

    #[test]
    fn test_chunk_content_hash_different() {
        let chunk1 = Chunk {
            content: "content 1".to_string(),
            start_offset: 0,
            end_offset: 9,
            start_line: 1,
            end_line: 1,
        };
        let chunk2 = Chunk {
            content: "content 2".to_string(),
            start_offset: 0,
            end_offset: 9,
            start_line: 1,
            end_line: 1,
        };

        assert_ne!(chunk1.content_hash(), chunk2.content_hash());
    }

    #[test]
    fn test_empty_text() {
        let chunker = SemanticChunker::default();
        let chunks = chunker.split("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        let chunker = SemanticChunker::default();
        let chunks = chunker.split("   \n\n   \t   ");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_config_round_trip() {
        let chunker = SemanticChunker::new(256, 32, vec!["\n".to_string()]);
        let config = chunker.config();

        assert_eq!(config.chunk_size, 256);
        assert_eq!(config.chunk_overlap, 32);
    }

    #[test]
    fn test_large_document_chunking() {
        let chunker = SemanticChunker::new(100, 20, SemanticChunker::default_separators());

        // Create a large document
        let mut text = String::new();
        for i in 0..50 {
            text.push_str(&format!(
                "\n## Section {}\n\nThis is paragraph {} with some content.\n",
                i, i
            ));
        }

        let chunks = chunker.split(&text);

        // Should produce multiple chunks
        assert!(chunks.len() > 1);

        // All chunks should be non-empty
        for chunk in &chunks {
            assert!(!chunk.content.trim().is_empty());
        }
    }

    // Property-based tests for semantic chunker
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        // Strategy for generating reasonable chunk sizes
        fn chunk_size_strategy() -> impl Strategy<Value = usize> {
            32usize..=1024
        }

        // Strategy for generating reasonable overlap (must be less than chunk size).
        // Not used by the properties below, so allow dead_code to keep test builds warning-free.
        #[allow(dead_code)]
        fn overlap_strategy(chunk_size: usize) -> impl Strategy<Value = usize> {
            0..=(chunk_size / 2)
        }

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(50))]

            /// Property: Chunking any text produces chunks whose content equals the original
            #[test]
            fn prop_chunking_preserves_content(text in ".{0,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                // If there are chunks, they should contain content from the original
                for chunk in &chunks {
                    // Each chunk content should be found in original text (accounting for overlap)
                    prop_assert!(text.contains(chunk.content.trim()) || chunk.content.trim().is_empty());
                }
            }

            /// Property: All chunks have valid line numbers (start <= end)
            #[test]
            fn prop_chunk_lines_valid(text in ".{1,500}\n.{1,500}\n.{1,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                for chunk in &chunks {
                    prop_assert!(chunk.start_line <= chunk.end_line,
                        "start_line {} > end_line {}", chunk.start_line, chunk.end_line);
                    prop_assert!(chunk.start_line >= 1,
                        "start_line {} should be >= 1", chunk.start_line);
                }
            }

            /// Property: Chunk offsets are valid (start <= end, within bounds)
            #[test]
            fn prop_chunk_offsets_valid(text in ".{10,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                for chunk in &chunks {
                    prop_assert!(chunk.start_offset <= chunk.end_offset,
                        "start_offset {} > end_offset {}", chunk.start_offset, chunk.end_offset);
                    prop_assert!(chunk.end_offset <= text.len(),
                        "end_offset {} > text.len() {}", chunk.end_offset, text.len());
                }
            }

            /// Property: Content hashes are deterministic
            #[test]
            fn prop_content_hash_deterministic(content in ".{1,100}") {
                let chunk = Chunk {
                    content: content.clone(),
                    start_offset: 0,
                    end_offset: content.len(),
                    start_line: 1,
                    end_line: 1,
                };

                let hash1 = chunk.content_hash();
                let hash2 = chunk.content_hash();
                prop_assert_eq!(hash1, hash2);
            }

            /// Property: Different content produces different hashes (with high probability)
            #[test]
            fn prop_content_hash_different(
                content1 in "[a-z]{5,20}",
                content2 in "[A-Z]{5,20}"
            ) {
                // Ensure contents are different
                if content1 != content2 {
                    let chunk1 = Chunk {
                        content: content1.clone(),
                        start_offset: 0,
                        end_offset: content1.len(),
                        start_line: 1,
                        end_line: 1,
                    };
                    let chunk2 = Chunk {
                        content: content2.clone(),
                        start_offset: 0,
                        end_offset: content2.len(),
                        start_line: 1,
                        end_line: 1,
                    };

                    prop_assert_ne!(chunk1.content_hash(), chunk2.content_hash());
                }
            }

            /// Property: Custom chunk sizes are respected (approximately)
            #[test]
            fn prop_chunk_size_respected(
                chunk_size in chunk_size_strategy(),
                text_len in 100usize..2000
            ) {
                let overlap = chunk_size / 4;
                let chunker = SemanticChunker::new(chunk_size, overlap, vec![" ".to_string()]);

                // Generate text of specified length
                let text: String = (0..text_len).map(|i| if i % 10 == 0 { ' ' } else { 'a' }).collect();
                let chunks = chunker.split(&text);

                // Most chunks should be close to chunk_size (with some tolerance)
                for chunk in &chunks {
                    // Allow 2x tolerance for edge cases
                    prop_assert!(chunk.content.len() <= chunk_size * 2,
                        "chunk len {} > 2 * chunk_size {}", chunk.content.len(), chunk_size * 2);
                }
            }
        }
    }
}