// anno/backends/semantic_chunking.rs
//! Chunking helpers for long text.
//!
//! **Status**:
//! - By default, this module provides a lightweight **rule-based** chunker (paragraph boundaries
//!   + size limits + overlap).
//! - With the `semantic-chunking` feature enabled, this module additionally provides a
//!   sentence-level similarity chunker (token-based Jaccard similarity; no embedding model
//!   required).
//!
//! This keeps chunking behavior explicit without implying that embeddings are in use.
//!
//! # Example
//!
//! ```rust,ignore
//! use anno::backends::semantic_chunking::{SemanticChunker, SemanticChunkConfig};
//!
//! let config = SemanticChunkConfig::default();
//! let chunker = anno::backends::semantic_chunking::create_semantic_chunker(config)?;
//! let chunks = chunker.chunk(long_text, Some("en"))?;
//!
//! for chunk in chunks {
//!     println!("Chunk: {} ({} chars)", chunk.text, chunk.text.len());
//! }
//! ```
25
26use crate::Result;
27#[cfg(feature = "semantic-chunking")]
28use std::collections::BTreeSet;
29
/// Configuration for semantic chunking.
///
/// All sizes are measured in characters (Unicode scalar values), matching the
/// char-offset bookkeeping used by the chunkers in this module — not bytes.
#[derive(Debug, Clone)]
pub struct SemanticChunkConfig {
    /// Target chunk size in characters (soft limit)
    pub target_size: usize,
    /// Minimum chunk size in characters (hard limit)
    pub min_size: usize,
    /// Maximum chunk size in characters (hard limit)
    ///
    /// NOTE(review): the rule-based chunker only flushes on paragraph
    /// boundaries, so a single paragraph longer than `max_size` still yields
    /// one oversized chunk — confirm this is the intended behavior.
    pub max_size: usize,
    /// Similarity threshold for chunk boundaries (0.0-1.0)
    /// Lower = more chunks, Higher = fewer chunks
    pub similarity_threshold: f32,
    /// Overlap between chunks in characters
    pub overlap: usize,
    /// Use sentence boundaries as fallback when similarity is ambiguous
    ///
    /// NOTE(review): currently never read by either chunker in this module.
    pub fallback_to_sentences: bool,
}
47
48impl Default for SemanticChunkConfig {
49    fn default() -> Self {
50        Self {
51            target_size: 10_000,
52            min_size: 1_000,
53            max_size: 20_000,
54            similarity_threshold: 0.7,
55            overlap: 200,
56            fallback_to_sentences: true,
57        }
58    }
59}
60
61impl SemanticChunkConfig {
62    /// Create config optimized for long documents.
63    pub fn long_document() -> Self {
64        Self {
65            target_size: 50_000,
66            min_size: 5_000,
67            max_size: 100_000,
68            similarity_threshold: 0.75,
69            overlap: 500,
70            fallback_to_sentences: true,
71        }
72    }
73
74    /// Create config for coreference resolution (smaller chunks, higher similarity).
75    pub fn coreference() -> Self {
76        Self {
77            target_size: 5_000,
78            min_size: 500,
79            max_size: 10_000,
80            similarity_threshold: 0.8, // Higher = keep related mentions together
81            overlap: 300,
82            fallback_to_sentences: true,
83        }
84    }
85}
86
/// A semantically coherent chunk of text.
///
/// `start`/`end` are character offsets (Unicode scalar indices) into the
/// original text, with `end` exclusive — not byte offsets.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// The text content of this chunk
    pub text: String,
    /// Starting character offset in original text
    pub start: usize,
    /// Ending character offset in original text
    pub end: usize,
    /// Optional topic label (if available)
    ///
    /// NOTE(review): not populated by any chunker in this module today.
    pub topic: Option<String>,
    /// Semantic similarity score with previous chunk (if available)
    pub similarity_to_prev: Option<f32>,
}
101
/// Trait for semantic chunking strategies.
///
/// `Send + Sync` is required so a boxed chunker can be shared across threads.
pub trait SemanticChunker: Send + Sync {
    /// Chunk text based on semantic similarity.
    ///
    /// `language` is an optional language hint (e.g. `"en"`); the current
    /// implementations in this module accept but ignore it.
    ///
    /// Returns chunks sorted by position in the original text.
    fn chunk(&self, text: &str, language: Option<&str>) -> Result<Vec<SemanticChunk>>;
}
109
/// Simple rule-based semantic chunker (fallback when embeddings unavailable).
///
/// Uses paragraph boundaries and sentence clustering as a lightweight alternative
/// to embedding-based chunking.
#[derive(Debug)]
pub struct RuleBasedSemanticChunker {
    /// Size/overlap settings; the similarity fields of the config are not
    /// consulted by this chunker.
    config: SemanticChunkConfig,
}
118
119impl RuleBasedSemanticChunker {
120    /// Create a new rule-based semantic chunker.
121    pub fn new(config: SemanticChunkConfig) -> Self {
122        Self { config }
123    }
124}
125
fn char_to_byte_map(text: &str) -> Vec<usize> {
    // Byte offset of every char boundary, terminated by a `text.len()` sentinel,
    // so any char index in 0..=len_chars maps directly to a byte offset.
    text.char_indices()
        .map(|(byte_idx, _)| byte_idx)
        .chain(std::iter::once(text.len()))
        .collect()
}
137
fn byte_at_char(char_to_byte: &[usize], char_idx: usize) -> usize {
    // Out-of-range indices clamp to the final sentinel entry; an empty map
    // yields 0. Callers should only pass 0..=len_chars.
    match char_to_byte.get(char_idx) {
        Some(&byte) => byte,
        None => char_to_byte.last().copied().unwrap_or(0),
    }
}
146
fn paragraph_ranges(text: &str) -> Vec<(usize, usize)> {
    // Paragraphs are groups of non-blank lines separated by at least one blank line.
    //
    // This is language-agnostic but formatting-dependent: it assumes newline structure is meaningful.
    // We define "blank" as a line that contains only whitespace (spaces/tabs) plus newline markers.
    //
    // Returned ranges are in character offsets [start, end), preserving original text.
    //
    // NOTE(review): bare '\r' (classic-Mac line endings, no '\n') never closes a
    // line here, so such text is treated as one long line — confirm acceptable.
    let mut out = Vec::new();
    // Char offset where the current paragraph opened (None = between paragraphs).
    let mut para_start: Option<usize> = None;
    // Char offset where the current line started.
    let mut line_start = 0usize;
    // True once the current line has seen any non-whitespace character.
    let mut line_has_non_ws = false;

    // i tracks the current char offset (Unicode scalar index, not bytes).
    let mut i = 0usize;
    for c in text.chars() {
        match c {
            '\n' => {
                // End of line at char offset i (exclusive of '\n'; the newline itself is part of
                // the source text but not part of the line content).
                if line_has_non_ws {
                    if para_start.is_none() {
                        para_start = Some(line_start);
                    }
                } else if let Some(ps) = para_start {
                    // Blank line closes paragraph at the start of this blank line.
                    out.push((ps, line_start));
                    para_start = None;
                }
                i += 1;
                line_start = i;
                line_has_non_ws = false;
            }
            '\r' => {
                // Treat CR as whitespace. If the text uses CRLF, the '\n' branch will close lines.
                i += 1;
            }
            ' ' | '\t' => {
                // Whitespace never opens a paragraph; just advance the offset.
                i += 1;
            }
            _ => {
                line_has_non_ws = true;
                i += 1;
            }
        }
    }

    // Final line: if it had content, ensure paragraph is opened.
    if line_has_non_ws && para_start.is_none() {
        para_start = Some(line_start);
    }
    // Close any still-open paragraph at end-of-text (i == total char count).
    if let Some(ps) = para_start {
        out.push((ps, i));
    }
    out
}
201
impl SemanticChunker for RuleBasedSemanticChunker {
    /// Chunk `text` by accumulating whole paragraphs up to the configured sizes.
    ///
    /// Boundaries always fall on paragraph edges; consequently a single
    /// paragraph longer than `max_size` is emitted as one oversized chunk
    /// rather than split. Offsets in the result are char offsets.
    fn chunk(&self, text: &str, language: Option<&str>) -> Result<Vec<SemanticChunk>> {
        let _ = language; // Acknowledge parameter for future use

        if text.is_empty() {
            return Ok(Vec::new());
        }

        let char_to_byte = char_to_byte_map(text);
        // Total chars = map length minus the trailing sentinel entry.
        let text_len_chars = char_to_byte.len().saturating_sub(1);

        // Paragraph detection is formatting-based; if we can't detect paragraphs, fall back to a
        // single range over the whole document.
        let mut paras = paragraph_ranges(text);
        if paras.is_empty() {
            paras = vec![(0, text_len_chars)];
        }

        // Build chunk ranges in char offsets.
        let mut ranges: Vec<(usize, usize)> = Vec::new();
        let mut cur_start: Option<usize> = None;
        let mut cur_end: usize = 0;

        for (p_start, p_end) in paras {
            // Skip degenerate/empty paragraph ranges defensively.
            if p_end <= p_start {
                continue;
            }
            if cur_start.is_none() {
                cur_start = Some(p_start);
                cur_end = p_start;
            }

            // Would adding this paragraph exceed max_size?
            let next_end = cur_end.max(p_end);
            if let Some(cs) = cur_start {
                let cur_len = next_end.saturating_sub(cs);
                if cur_len > self.config.max_size && cur_end > cs {
                    // Flush current chunk at cur_end.
                    ranges.push((cs, cur_end));
                    // Start new chunk with overlap.
                    let overlap_start = cur_end.saturating_sub(self.config.overlap);
                    // Clamp so the new chunk never starts after this paragraph.
                    cur_start = Some(overlap_start.min(p_start));
                    cur_end = cur_start.unwrap();
                }
            }

            // Extend current chunk to include this paragraph.
            cur_end = cur_end.max(p_end);

            // Soft flush near target size when we're already at/over target and we just ended a paragraph.
            if let Some(cs) = cur_start {
                let cur_len = cur_end.saturating_sub(cs);
                if cur_len >= self.config.target_size && cur_len >= self.config.min_size {
                    ranges.push((cs, cur_end));
                    let overlap_start = cur_end.saturating_sub(self.config.overlap);
                    // The next chunk begins inside the tail of the flushed one.
                    cur_start = Some(overlap_start.min(cur_end));
                    cur_end = cur_start.unwrap();
                }
            }
        }

        // Flush the trailing chunk, if any content remains.
        if let Some(cs) = cur_start {
            if cur_end > cs {
                ranges.push((cs, cur_end));
            }
        }

        // Merge too-small chunks into the previous chunk by extending the previous range.
        // (A too-small *first* chunk is kept as-is: there is nothing before it to merge into.)
        let mut merged: Vec<(usize, usize)> = Vec::new();
        for (s, e) in ranges {
            let len = e.saturating_sub(s);
            if len < self.config.min_size && !merged.is_empty() {
                let last = merged.last_mut().unwrap();
                last.1 = last.1.max(e);
            } else {
                merged.push((s, e));
            }
        }

        // Materialize chunks from original text slices to preserve offsets/text exactly.
        let mut out = Vec::new();
        for (s, e) in merged {
            let sb = byte_at_char(&char_to_byte, s);
            let eb = byte_at_char(&char_to_byte, e);
            if eb <= sb {
                continue;
            }
            // `get` guards against boundary bugs; yields "" rather than panicking.
            let chunk_text = text.get(sb..eb).unwrap_or("").to_string();
            // Drop whitespace-only chunks entirely.
            if chunk_text.trim().is_empty() {
                continue;
            }
            out.push(SemanticChunk {
                text: chunk_text,
                start: s,
                end: e,
                topic: None,
                similarity_to_prev: None,
            });
        }

        Ok(out)
    }
}
305
/// Sentence-similarity chunker (feature = `semantic-chunking`).
///
/// Uses sentence-level similarity to identify coarse boundaries.
///
/// Despite the name, the current implementation does **not** use embeddings: it uses a
/// sentence-level token Jaccard similarity to decide boundaries. This keeps the feature gate and
/// config surface stable while avoiding heavyweight dependencies.
#[cfg(feature = "semantic-chunking")]
#[derive(Debug)]
pub struct EmbeddingSemanticChunker {
    /// Size/overlap/threshold settings driving boundary decisions.
    config: SemanticChunkConfig,
    // TODO: Add embedding model when available
    // embedding_model: Box<dyn EmbeddingModel>,
}
320
#[cfg(feature = "semantic-chunking")]
impl EmbeddingSemanticChunker {
    /// Create a new sentence-similarity chunker.
    ///
    /// Currently infallible (no model is loaded); the `Result` return keeps
    /// the signature stable for a future embedding-backed implementation.
    pub fn new(config: SemanticChunkConfig) -> Result<Self> {
        Ok(Self { config })
    }

    /// Tokenize a sentence into a set of lowercase alphanumeric tokens.
    ///
    /// Intentionally simple and dependency-light:
    /// - ASCII-lowercase each alphanumeric char (non-ASCII keeps its case)
    /// - scrub non-alphanumeric chars to spaces
    /// - split on whitespace
    /// - drop tokens of 2 chars or fewer (noise)
    fn tokenize_for_similarity(s: &str) -> BTreeSet<String> {
        let mut scrubbed = String::with_capacity(s.len());
        for c in s.chars() {
            if c.is_alphanumeric() {
                scrubbed.push(c.to_ascii_lowercase());
            } else {
                scrubbed.push(' ');
            }
        }
        scrubbed
            .split_whitespace()
            .filter(|w| w.chars().count() > 2)
            .map(|w| w.to_string())
            .collect()
    }

    /// Jaccard similarity of two token sets.
    ///
    /// Two empty sets are defined as identical (1.0); exactly one empty set is
    /// completely dissimilar (0.0).
    fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f32 {
        if a.is_empty() && b.is_empty() {
            return 1.0;
        }
        if a.is_empty() || b.is_empty() {
            return 0.0;
        }
        // Both sets are non-empty here, so the union count is >= 1 and the
        // division cannot be by zero (the old `uni <= 0.0` guard was dead code).
        let inter = a.intersection(b).count() as f32;
        let uni = a.union(b).count() as f32;
        inter / uni
    }

    /// Thin wrapper over the module-level helper.
    ///
    /// Calls the free function directly rather than via the old fragile
    /// `super::semantic_chunking::` path, which broke if this module was
    /// renamed or re-exported under a different name.
    fn char_to_byte_map(text: &str) -> Vec<usize> {
        char_to_byte_map(text)
    }

    /// Thin wrapper over the module-level helper (see `char_to_byte_map` above).
    fn byte_at_char(map: &[usize], char_idx: usize) -> usize {
        byte_at_char(map, char_idx)
    }

    /// Coarse sentence segmentation: split after common terminator characters.
    ///
    /// Returns `(start_char, end_char)` spans covering the whole input. Spans
    /// may include leading whitespace; no abbreviation handling is attempted.
    fn split_sentences_spans(text: &str) -> Vec<(usize, usize)> {
        let terminators = [
            '.', '!', '?', // Latin
            '。', '!', '?', // CJK
            '؟', '۔', // Arabic/Urdu
            '।', // Devanagari
        ];
        let mut out = Vec::new();
        let mut start = 0usize;
        let mut i = 0usize;
        for c in text.chars() {
            i += 1;
            if terminators.contains(&c) {
                if i > start {
                    out.push((start, i));
                }
                start = i;
            }
        }
        // Trailing text without a terminator still forms a final span.
        if i > start {
            out.push((start, i));
        }
        out
    }
}
399
#[cfg(feature = "semantic-chunking")]
impl SemanticChunker for EmbeddingSemanticChunker {
    /// Chunk `text` at sentence boundaries, cutting where consecutive
    /// sentences have low token-Jaccard similarity or where size limits hit.
    ///
    /// NOTE(review): each chunk's `text` is trimmed, but `start`/`end` still
    /// cover the untrimmed span, so `text` can be shorter than `end - start`.
    fn chunk(&self, text: &str, language: Option<&str>) -> Result<Vec<SemanticChunk>> {
        let _ = language; // Reserved for future language-specific segmentation.
        let t = text.trim();
        if t.is_empty() {
            return Ok(vec![]);
        }

        let spans = Self::split_sentences_spans(text);
        if spans.is_empty() {
            // No sentences detected: delegate to the rule-based chunker.
            let fallback = RuleBasedSemanticChunker::new(self.config.clone());
            return fallback.chunk(text, None);
        }

        let char_to_byte = Self::char_to_byte_map(text);

        let mut chunks: Vec<SemanticChunk> = Vec::new();
        // Current chunk bounds, in char offsets.
        let mut chunk_start_char = spans[0].0;
        let mut chunk_end_char = spans[0].1;
        // Token set of the previous non-empty sentence, for similarity checks.
        let mut prev_sentence_tokens: Option<BTreeSet<String>> = None;
        // Similarity recorded at the boundary that opened the current chunk.
        let mut prev_chunk_similarity: Option<f32> = None;

        for (idx, (s0, s1)) in spans.iter().copied().enumerate() {
            let sent_start = s0;
            let sent_end = s1;

            let sent_bytes_start = Self::byte_at_char(&char_to_byte, sent_start);
            let sent_bytes_end = Self::byte_at_char(&char_to_byte, sent_end);
            let sent_text = text
                .get(sent_bytes_start..sent_bytes_end)
                .unwrap_or("")
                .trim();

            // Whitespace-only spans contribute nothing to similarity or bounds.
            if sent_text.is_empty() {
                continue;
            }

            let tokens = Self::tokenize_for_similarity(sent_text);
            let sim_to_prev_sentence = prev_sentence_tokens
                .as_ref()
                .map(|p| Self::jaccard(p, &tokens));

            // Decide whether to cut before this sentence.
            if idx > 0 {
                let cur_len = chunk_end_char.saturating_sub(chunk_start_char);
                let would_len = sent_end.saturating_sub(chunk_start_char);
                // Low similarity to the previous sentence suggests a topic shift.
                let similarity_break = sim_to_prev_sentence
                    .map(|s| s < self.config.similarity_threshold)
                    .unwrap_or(false);
                // max_size is only honored once the chunk has reached min_size.
                let would_exceed =
                    would_len > self.config.max_size && cur_len >= self.config.min_size;

                if (similarity_break && cur_len >= self.config.min_size) || would_exceed {
                    let start_b = Self::byte_at_char(&char_to_byte, chunk_start_char);
                    let end_b = Self::byte_at_char(&char_to_byte, chunk_end_char);
                    let chunk_text = text.get(start_b..end_b).unwrap_or("").trim().to_string();
                    if !chunk_text.is_empty() {
                        chunks.push(SemanticChunk {
                            text: chunk_text,
                            start: chunk_start_char,
                            end: chunk_end_char,
                            topic: None,
                            similarity_to_prev: prev_chunk_similarity,
                        });
                    }

                    // Start new chunk, with optional overlap.
                    // Overlap is clamped so the new chunk never starts after this sentence.
                    let overlap_start_char = chunk_end_char
                        .saturating_sub(self.config.overlap)
                        .min(sent_start);
                    chunk_start_char = overlap_start_char;
                    prev_chunk_similarity = sim_to_prev_sentence;
                }
            }

            // Extend chunk end to cover this sentence.
            chunk_end_char = sent_end;
            prev_sentence_tokens = Some(tokens);
        }

        // Final chunk.
        if chunk_end_char > chunk_start_char {
            let start_b = Self::byte_at_char(&char_to_byte, chunk_start_char);
            let end_b = Self::byte_at_char(&char_to_byte, chunk_end_char);
            let chunk_text = text.get(start_b..end_b).unwrap_or("").trim().to_string();
            if !chunk_text.is_empty() {
                chunks.push(SemanticChunk {
                    text: chunk_text,
                    start: chunk_start_char,
                    end: chunk_end_char,
                    topic: None,
                    similarity_to_prev: prev_chunk_similarity,
                });
            }
        }

        // If trimming emptied everything, fall back to the rule-based chunker.
        if chunks.is_empty() {
            let fallback = RuleBasedSemanticChunker::new(self.config.clone());
            return fallback.chunk(text, None);
        }

        Ok(chunks)
    }
}
505
506/// Factory function to create appropriate chunker based on available features.
507pub fn create_semantic_chunker(config: SemanticChunkConfig) -> Result<Box<dyn SemanticChunker>> {
508    #[cfg(feature = "semantic-chunking")]
509    {
510        // Try embedding-based chunker first
511        match EmbeddingSemanticChunker::new(config.clone()) {
512            Ok(chunker) => Ok(Box::new(chunker)),
513            Err(_) => Ok(Box::new(RuleBasedSemanticChunker::new(config))),
514        }
515    }
516
517    #[cfg(not(feature = "semantic-chunking"))]
518    {
519        // Fall back to rule-based
520        Ok(Box::new(RuleBasedSemanticChunker::new(config)))
521    }
522}
523
#[cfg(test)]
mod tests {
    use super::*;

    /// The rule-based chunker produces at least one chunk for multi-paragraph
    /// input, and the first chunk starts at offset 0.
    #[test]
    fn test_rule_based_chunker() {
        let config = SemanticChunkConfig {
            target_size: 100,
            min_size: 50,
            max_size: 200,
            similarity_threshold: 0.7,
            overlap: 20,
            fallback_to_sentences: true,
        };
        let text = "Paragraph one.\n\nParagraph two.\n\nParagraph three.";

        let chunks = RuleBasedSemanticChunker::new(config)
            .chunk(text, None)
            .unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].start, 0);
    }

    /// Chunks below `min_size` are merged rather than emitted separately.
    #[test]
    fn test_chunker_respects_min_size() {
        let config = SemanticChunkConfig {
            target_size: 1000,
            min_size: 100,
            max_size: 2000,
            similarity_threshold: 0.7,
            overlap: 50,
            fallback_to_sentences: true,
        };
        let text = "Short.\n\nAlso short.";

        let chunks = RuleBasedSemanticChunker::new(config)
            .chunk(text, None)
            .unwrap();

        // Small chunks should be merged
        assert!(chunks.len() <= 1 || chunks[0].text.chars().count() >= 100);
    }
}