// graphrag_core/nlp/semantic_chunking.rs
1//! Semantic Chunking
2//!
3//! This module provides intelligent text chunking strategies that respect:
4//! - Sentence boundaries
5//! - Paragraph structure
6//! - Topic coherence
7//! - Semantic similarity
8//!
9//! ## Chunking Strategies
10//!
11//! 1. **Sentence-based**: Chunks at sentence boundaries
12//! 2. **Paragraph-based**: Chunks at paragraph breaks
13//! 3. **Topic-based**: Chunks when topic shifts detected
14//! 4. **Semantic**: Chunks based on embedding similarity
15//! 5. **Hybrid**: Combines multiple strategies
16
17use serde::{Deserialize, Serialize};
18
19/// Chunking strategy
20#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
21pub enum ChunkingStrategy {
22    /// Fixed-size chunks (character count)
23    FixedSize,
24    /// Sentence boundary-based chunks
25    Sentence,
26    /// Paragraph boundary-based chunks
27    Paragraph,
28    /// Topic shift detection
29    Topic,
30    /// Semantic similarity-based
31    Semantic,
32    /// Hybrid approach (combines multiple strategies)
33    Hybrid,
34}
35
/// Chunking configuration
///
/// Size fields are compared against `char` counts by the fixed-size strategy
/// and against byte lengths (`str::len`) by the sentence/paragraph-based
/// strategies, so multi-byte UTF-8 text may produce slightly smaller chunks
/// than the numbers suggest for the latter.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingConfig {
    /// Chunking strategy
    pub strategy: ChunkingStrategy,
    /// Target chunk size (characters or sentences)
    ///
    /// Soft target: a chunk is flushed once accumulated content reaches it.
    pub target_size: usize,
    /// Minimum chunk size
    ///
    /// Trailing chunks below this size are dropped by the sentence and topic
    /// strategies (their text is not emitted).
    pub min_size: usize,
    /// Maximum chunk size
    ///
    /// Hard cap: a chunk is flushed before adding content would exceed it.
    pub max_size: usize,
    /// Overlap size (characters or sentences)
    ///
    /// Used as a character count by the fixed-size strategy; the sentence
    /// strategy instead carries the last sentence forward as overlap.
    pub overlap: usize,
    /// Similarity threshold for semantic chunking (0.0 to 1.0)
    ///
    /// The topic strategy places a boundary where lexical cohesion between
    /// adjacent sentences falls below this value.
    pub similarity_threshold: f32,
}
52
53impl Default for ChunkingConfig {
54    fn default() -> Self {
55        Self {
56            strategy: ChunkingStrategy::Sentence,
57            target_size: 500,
58            min_size: 100,
59            max_size: 1000,
60            overlap: 50,
61            similarity_threshold: 0.7,
62        }
63    }
64}
65
/// Text chunk with metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticChunk {
    /// Chunk text
    pub text: String,
    /// Start position in original text
    ///
    /// NOTE(review): for sentence/paragraph-derived strategies this offset
    /// is computed from the space-joined chunk text, so it is approximate
    /// when the original text contains irregular whitespace — confirm
    /// before using it to slice the original text.
    pub start: usize,
    /// End position in original text
    ///
    /// Same caveat as `start`; always `start + text.len()` (or the `char`
    /// index past the chunk for the fixed-size strategy).
    pub end: usize,
    /// Sentence count in chunk
    ///
    /// Left at 0 by the fixed-size strategy.
    pub sentence_count: usize,
    /// Paragraph count in chunk
    ///
    /// Left at 0 by the fixed-size strategy.
    pub paragraph_count: usize,
    /// Coherence score (higher = more coherent)
    ///
    /// Mean pairwise lexical (word-overlap) cohesion of adjacent sentences;
    /// 1.0 for chunks with fewer than two sentences.
    pub coherence: f32,
}
82
/// Semantic chunker
///
/// Stateless apart from its configuration; [`SemanticChunker::chunk`]
/// dispatches on `config.strategy`.
pub struct SemanticChunker {
    // Strategy and size parameters applied by every `chunk_*` method.
    config: ChunkingConfig,
}
87
88impl SemanticChunker {
89    /// Create new semantic chunker
90    pub fn new(config: ChunkingConfig) -> Self {
91        Self { config }
92    }
93
94    /// Create with default configuration
95    pub fn default_config() -> Self {
96        Self {
97            config: ChunkingConfig::default(),
98        }
99    }
100
101    /// Chunk text according to strategy
102    pub fn chunk(&self, text: &str) -> Vec<SemanticChunk> {
103        match self.config.strategy {
104            ChunkingStrategy::FixedSize => self.chunk_fixed_size(text),
105            ChunkingStrategy::Sentence => self.chunk_by_sentences(text),
106            ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text),
107            ChunkingStrategy::Topic => self.chunk_by_topic(text),
108            ChunkingStrategy::Semantic => self.chunk_by_similarity(text),
109            ChunkingStrategy::Hybrid => self.chunk_hybrid(text),
110        }
111    }
112
113    /// Fixed-size chunking (baseline)
114    fn chunk_fixed_size(&self, text: &str) -> Vec<SemanticChunk> {
115        let mut chunks = Vec::new();
116        let chars: Vec<char> = text.chars().collect();
117        let total_len = chars.len();
118        let mut start = 0;
119
120        while start < total_len {
121            let end = (start + self.config.target_size).min(total_len);
122            let chunk_text: String = chars[start..end].iter().collect();
123
124            chunks.push(SemanticChunk {
125                text: chunk_text,
126                start,
127                end,
128                sentence_count: 0, // Will be calculated if needed
129                paragraph_count: 0,
130                coherence: 1.0,
131            });
132
133            start += self.config.target_size - self.config.overlap;
134        }
135
136        chunks
137    }
138
139    /// Sentence-based chunking
140    fn chunk_by_sentences(&self, text: &str) -> Vec<SemanticChunk> {
141        let sentences = self.split_sentences(text);
142        let mut chunks = Vec::new();
143        let mut current_chunk = Vec::new();
144        let mut current_size = 0;
145        let mut chunk_start = 0;
146
147        for sentence in sentences.iter() {
148            let sentence_len = sentence.len();
149
150            // Check if adding this sentence exceeds max size
151            if current_size + sentence_len > self.config.max_size && !current_chunk.is_empty() {
152                // Create chunk from accumulated sentences
153                let chunk_text = current_chunk.join(" ");
154                let chunk_end = chunk_start + chunk_text.len();
155
156                chunks.push(SemanticChunk {
157                    text: chunk_text,
158                    start: chunk_start,
159                    end: chunk_end,
160                    sentence_count: current_chunk.len(),
161                    paragraph_count: self.count_paragraphs(&current_chunk.join(" ")),
162                    coherence: self.calculate_coherence(&current_chunk),
163                });
164
165                // Start new chunk with overlap
166                let overlap_sentences = if current_chunk.len() > 1 {
167                    vec![current_chunk.last().unwrap().clone()]
168                } else {
169                    Vec::new()
170                };
171
172                chunk_start = chunk_end - overlap_sentences.join(" ").len();
173                current_chunk = overlap_sentences;
174                current_size = current_chunk.iter().map(|s| s.len()).sum();
175            }
176
177            current_chunk.push(sentence.clone());
178            current_size += sentence_len;
179
180            // Check if we've reached target size
181            if current_size >= self.config.target_size {
182                let chunk_text = current_chunk.join(" ");
183                let chunk_end = chunk_start + chunk_text.len();
184
185                chunks.push(SemanticChunk {
186                    text: chunk_text,
187                    start: chunk_start,
188                    end: chunk_end,
189                    sentence_count: current_chunk.len(),
190                    paragraph_count: self.count_paragraphs(&current_chunk.join(" ")),
191                    coherence: self.calculate_coherence(&current_chunk),
192                });
193
194                // Start new chunk with overlap
195                let overlap_sentences = if current_chunk.len() > 1 {
196                    vec![current_chunk.last().unwrap().clone()]
197                } else {
198                    Vec::new()
199                };
200
201                chunk_start = chunk_end - overlap_sentences.join(" ").len();
202                current_chunk = overlap_sentences;
203                current_size = current_chunk.iter().map(|s| s.len()).sum();
204            }
205        }
206
207        // Add remaining sentences as final chunk
208        if !current_chunk.is_empty() && current_chunk.join(" ").len() >= self.config.min_size {
209            let chunk_text = current_chunk.join(" ");
210            let chunk_end = chunk_start + chunk_text.len();
211
212            chunks.push(SemanticChunk {
213                text: chunk_text,
214                start: chunk_start,
215                end: chunk_end,
216                sentence_count: current_chunk.len(),
217                paragraph_count: self.count_paragraphs(&current_chunk.join(" ")),
218                coherence: self.calculate_coherence(&current_chunk),
219            });
220        }
221
222        chunks
223    }
224
225    /// Paragraph-based chunking
226    fn chunk_by_paragraphs(&self, text: &str) -> Vec<SemanticChunk> {
227        let paragraphs: Vec<&str> = text.split("\n\n")
228            .filter(|p| !p.trim().is_empty())
229            .collect();
230
231        let mut chunks = Vec::new();
232        let mut current_chunk = Vec::new();
233        let mut current_size = 0;
234        let mut chunk_start = 0;
235
236        for paragraph in paragraphs {
237            let para_len = paragraph.len();
238
239            if current_size + para_len > self.config.max_size && !current_chunk.is_empty() {
240                // Create chunk
241                let chunk_text = current_chunk.join("\n\n");
242                let chunk_end = chunk_start + chunk_text.len();
243
244                chunks.push(SemanticChunk {
245                    text: chunk_text.clone(),
246                    start: chunk_start,
247                    end: chunk_end,
248                    sentence_count: self.count_sentences(&chunk_text),
249                    paragraph_count: current_chunk.len(),
250                    coherence: self.calculate_coherence(&current_chunk),
251                });
252
253                chunk_start = chunk_end;
254                current_chunk = Vec::new();
255                current_size = 0;
256            }
257
258            current_chunk.push(paragraph.to_string());
259            current_size += para_len;
260        }
261
262        // Add remaining chunk
263        if !current_chunk.is_empty() {
264            let chunk_text = current_chunk.join("\n\n");
265            let chunk_end = chunk_start + chunk_text.len();
266
267            chunks.push(SemanticChunk {
268                text: chunk_text.clone(),
269                start: chunk_start,
270                end: chunk_end,
271                sentence_count: self.count_sentences(&chunk_text),
272                paragraph_count: current_chunk.len(),
273                coherence: self.calculate_coherence(&current_chunk),
274            });
275        }
276
277        chunks
278    }
279
280    /// Topic-based chunking (simplified TextTiling algorithm)
281    fn chunk_by_topic(&self, text: &str) -> Vec<SemanticChunk> {
282        let sentences = self.split_sentences(text);
283        let mut chunks = Vec::new();
284
285        // Calculate lexical cohesion scores between adjacent sentences
286        let mut boundaries = vec![0]; // Start of text is always a boundary
287
288        for i in 1..sentences.len() {
289            let cohesion = self.lexical_cohesion(&sentences[i-1], &sentences[i]);
290
291            // If cohesion is low, mark as potential boundary
292            if cohesion < self.config.similarity_threshold {
293                boundaries.push(i);
294            }
295        }
296
297        boundaries.push(sentences.len()); // End of text is always a boundary
298
299        // Create chunks from boundaries
300        let mut text_pos = 0;
301        for window in boundaries.windows(2) {
302            let start_idx = window[0];
303            let end_idx = window[1];
304
305            let chunk_sentences = &sentences[start_idx..end_idx];
306            let chunk_text = chunk_sentences.join(" ");
307            let chunk_len = chunk_text.len();
308
309            if chunk_len >= self.config.min_size {
310                chunks.push(SemanticChunk {
311                    text: chunk_text,
312                    start: text_pos,
313                    end: text_pos + chunk_len,
314                    sentence_count: chunk_sentences.len(),
315                    paragraph_count: self.count_paragraphs(&chunk_sentences.join(" ")),
316                    coherence: self.calculate_coherence(chunk_sentences),
317                });
318            }
319
320            text_pos += chunk_len;
321        }
322
323        chunks
324    }
325
326    /// Semantic similarity-based chunking
327    fn chunk_by_similarity(&self, text: &str) -> Vec<SemanticChunk> {
328        // For now, fall back to sentence-based chunking
329        // TODO: Implement proper embedding-based similarity chunking
330        // This requires:
331        // 1. Generate embeddings for each sentence
332        // 2. Calculate cosine similarity between adjacent sentences
333        // 3. Create boundaries where similarity drops below threshold
334        // 4. Merge small chunks that are below min_size
335
336        self.chunk_by_sentences(text)
337    }
338
339    /// Hybrid chunking strategy
340    fn chunk_hybrid(&self, text: &str) -> Vec<SemanticChunk> {
341        // Start with paragraph boundaries
342        let para_chunks = self.chunk_by_paragraphs(text);
343
344        // Further split large paragraphs by sentences
345        let mut final_chunks = Vec::new();
346
347        for chunk in para_chunks {
348            if chunk.text.len() > self.config.max_size {
349                // Split by sentences
350                let mut temp_config = self.config.clone();
351                temp_config.strategy = ChunkingStrategy::Sentence;
352                let sub_chunker = SemanticChunker::new(temp_config);
353                let sub_chunks = sub_chunker.chunk(&chunk.text);
354                final_chunks.extend(sub_chunks);
355            } else {
356                final_chunks.push(chunk);
357            }
358        }
359
360        final_chunks
361    }
362
363    /// Split text into sentences (simple heuristic)
364    fn split_sentences(&self, text: &str) -> Vec<String> {
365        let mut sentences = Vec::new();
366        let mut current = String::new();
367
368        for c in text.chars() {
369            current.push(c);
370
371            // Simple sentence boundary detection
372            if matches!(c, '.' | '!' | '?') {
373                if let Some(next) = current.chars().last() {
374                    if next.is_whitespace() || !current.trim().is_empty() {
375                        sentences.push(current.trim().to_string());
376                        current = String::new();
377                    }
378                }
379            }
380        }
381
382        // Add remaining text
383        if !current.trim().is_empty() {
384            sentences.push(current.trim().to_string());
385        }
386
387        sentences
388    }
389
390    /// Count sentences in text
391    fn count_sentences(&self, text: &str) -> usize {
392        text.chars().filter(|c| matches!(c, '.' | '!' | '?')).count()
393    }
394
395    /// Count paragraphs in text
396    fn count_paragraphs(&self, text: &str) -> usize {
397        text.split("\n\n").filter(|p| !p.trim().is_empty()).count().max(1)
398    }
399
400    /// Calculate lexical cohesion between two texts (word overlap)
401    fn lexical_cohesion(&self, text1: &str, text2: &str) -> f32 {
402        let text1_lower = text1.to_lowercase();
403        let words1: std::collections::HashSet<_> = text1_lower
404            .split_whitespace()
405            .collect();
406
407        let text2_lower = text2.to_lowercase();
408        let words2: std::collections::HashSet<_> = text2_lower
409            .split_whitespace()
410            .collect();
411
412        let intersection = words1.intersection(&words2).count();
413        let union = words1.union(&words2).count();
414
415        if union == 0 {
416            0.0
417        } else {
418            intersection as f32 / union as f32
419        }
420    }
421
422    /// Calculate coherence score for a chunk (based on word overlap)
423    fn calculate_coherence(&self, sentences: &[String]) -> f32 {
424        if sentences.len() < 2 {
425            return 1.0;
426        }
427
428        let mut total_cohesion = 0.0;
429        for window in sentences.windows(2) {
430            total_cohesion += self.lexical_cohesion(&window[0], &window[1]);
431        }
432
433        total_cohesion / (sentences.len() - 1) as f32
434    }
435}
436
/// Chunking statistics
///
/// Aggregate summary over a set of [`SemanticChunk`]s, computed by
/// [`ChunkingStats::from_chunks`]. All fields are zero for an empty input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingStats {
    /// Total chunks created
    pub total_chunks: usize,
    /// Average chunk size (characters)
    ///
    /// NOTE(review): computed from `text.len()`, so this is actually a byte
    /// length for multi-byte UTF-8 content — confirm if char counts matter.
    pub avg_chunk_size: f32,
    /// Minimum chunk size
    pub min_chunk_size: usize,
    /// Maximum chunk size
    pub max_chunk_size: usize,
    /// Average coherence score
    pub avg_coherence: f32,
    /// Average sentences per chunk
    pub avg_sentences_per_chunk: f32,
}
453
454impl ChunkingStats {
455    /// Calculate statistics from chunks
456    pub fn from_chunks(chunks: &[SemanticChunk]) -> Self {
457        if chunks.is_empty() {
458            return Self {
459                total_chunks: 0,
460                avg_chunk_size: 0.0,
461                min_chunk_size: 0,
462                max_chunk_size: 0,
463                avg_coherence: 0.0,
464                avg_sentences_per_chunk: 0.0,
465            };
466        }
467
468        let total_chunks = chunks.len();
469        let sizes: Vec<usize> = chunks.iter().map(|c| c.text.len()).collect();
470        let avg_chunk_size = sizes.iter().sum::<usize>() as f32 / total_chunks as f32;
471        let min_chunk_size = *sizes.iter().min().unwrap();
472        let max_chunk_size = *sizes.iter().max().unwrap();
473
474        let avg_coherence = chunks.iter().map(|c| c.coherence).sum::<f32>() / total_chunks as f32;
475        let avg_sentences_per_chunk = chunks.iter().map(|c| c.sentence_count).sum::<usize>() as f32
476            / total_chunks as f32;
477
478        Self {
479            total_chunks,
480            avg_chunk_size,
481            min_chunk_size,
482            max_chunk_size,
483            avg_coherence,
484            avg_sentences_per_chunk,
485        }
486    }
487}
488
#[cfg(test)]
mod tests {
    use super::*;

    /// Two paragraphs, six sentences in total.
    const TEST_TEXT: &str = "This is the first sentence. This is the second sentence. \
                              This is the third sentence.\n\n\
                              This is a new paragraph with different content. \
                              It has multiple sentences too. \
                              And here is another one.";

    /// Build a config without repeating field names in every test.
    fn make_config(
        strategy: ChunkingStrategy,
        target_size: usize,
        min_size: usize,
        max_size: usize,
        overlap: usize,
        similarity_threshold: f32,
    ) -> ChunkingConfig {
        ChunkingConfig {
            strategy,
            target_size,
            min_size,
            max_size,
            overlap,
            similarity_threshold,
        }
    }

    #[test]
    fn test_fixed_size_chunking() {
        let chunker = SemanticChunker::new(make_config(
            ChunkingStrategy::FixedSize,
            50,
            10,
            100,
            10,
            0.7,
        ));
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        // No chunk may exceed the configured maximum.
        assert!(chunks.iter().all(|c| c.text.len() <= 100));
    }

    #[test]
    fn test_sentence_chunking() {
        let chunker = SemanticChunker::new(make_config(
            ChunkingStrategy::Sentence,
            100,
            20,
            200,
            20,
            0.7,
        ));
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.sentence_count > 0);
            assert!(chunk.text.len() >= 20);
        }
    }

    #[test]
    fn test_paragraph_chunking() {
        let chunker = SemanticChunker::new(make_config(
            ChunkingStrategy::Paragraph,
            100,
            20,
            500,
            0,
            0.7,
        ));
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.paragraph_count > 0));
    }

    #[test]
    fn test_topic_chunking() {
        let chunker = SemanticChunker::new(make_config(
            ChunkingStrategy::Topic,
            100,
            20,
            300,
            0,
            0.3,
        ));
        assert!(!chunker.chunk(TEST_TEXT).is_empty());
    }

    #[test]
    fn test_hybrid_chunking() {
        let chunker = SemanticChunker::new(make_config(
            ChunkingStrategy::Hybrid,
            100,
            20,
            150,
            10,
            0.7,
        ));
        assert!(!chunker.chunk(TEST_TEXT).is_empty());
    }

    #[test]
    fn test_chunking_stats() {
        let chunks = SemanticChunker::default_config().chunk(TEST_TEXT);
        let stats = ChunkingStats::from_chunks(&chunks);

        assert_eq!(stats.total_chunks, chunks.len());
        assert!(stats.avg_chunk_size > 0.0);
        assert!(stats.avg_coherence >= 0.0 && stats.avg_coherence <= 1.0);
    }

    #[test]
    fn test_sentence_splitting() {
        let chunker = SemanticChunker::default_config();
        let sentences = chunker.split_sentences("Hello world. How are you? I am fine!");

        assert_eq!(
            sentences,
            vec!["Hello world.", "How are you?", "I am fine!"]
        );
    }

    #[test]
    fn test_lexical_cohesion() {
        let chunker = SemanticChunker::default_config();

        // Shared vocabulary -> positive cohesion.
        let related = chunker.lexical_cohesion("The cat sat on the mat", "The cat was very happy");
        assert!(related > 0.0);

        // Unrelated vocabulary -> lower cohesion than related sentences.
        let unrelated =
            chunker.lexical_cohesion("The cat sat on the mat", "Quantum physics is complex");
        assert!(unrelated < related);
    }
}